We load the data for the protected variable Perceived Socioeconomic Status, obtained during preprocessing, into a dataframe. The columns are: name, text_x, text_y, Low qualification.

In [None]:
import pandas as pd
# Read the preprocessed profession spreadsheet into a dataframe and display it.
PROFESSION_XLSX = '/content/ex_profesions.xlsx'
profesion_df = pd.read_excel(PROFESSION_XLSX)
profesion_df

We create two dataframes: one for professions that require high qualification and another for those that require low qualification. Medical staff may perceive the level of qualification as an indicator of socioeconomic status.

In [None]:
# Split the notes on the binary 'Low qualification' flag:
# 0 -> high-qualification subset, 1 -> low-qualification subset.
# .copy() makes each subset an independent dataframe, so the columns added to
# `high` and `low` in later cells do not raise SettingWithCopyWarning and can
# never write back into `profesion_df`.
high = profesion_df[profesion_df['Low qualification'] == 0].copy()
low = profesion_df[profesion_df['Low qualification'] == 1].copy()

## Descriptive statistics

We calculate the length of the text of perceived high socioeconomic patients' notes.

In [None]:
import matplotlib.pyplot as plt
# Character length of every note in the high-qualification subset.
high_note_chars = high['text_y'].str.len()
high['text_length'] = high_note_chars
high

# Histogram of the note lengths, labelled through the Axes returned by hist().
length_axes = high['text_length'].hist()
length_axes.set_title('Histogram of Text Lengths for perceived high socioeconomic patients')
length_axes.set_xlabel('Length of Text')
length_axes.set_ylabel('Frequency')

# Central tendency of the note lengths.
print("Average length:", high['text_length'].mean())
print("Median length:", high['text_length'].median())

We compute the average and median number of words in the perceived high socioeconomic patients' notes.

In [None]:
# Number of whitespace-separated words per note. Using the .str accessor keeps
# missing notes as NaN (so they are excluded from the statistics, like
# 'text_length') instead of counting the literal string "nan" as one word.
high['word_count'] = high['text_y'].str.split().str.len()
high['word_count'].hist()

plt.title('Histogram word count for perceived high socioeconomic patients')
plt.xlabel('Number of Words')
plt.ylabel('Frequency')

# These are word counts, not character lengths, so label them accordingly.
print("Average word count:", high['word_count'].mean())
print("Median word count:", high['word_count'].median())

We calculate the length of the text of perceived low socioeconomic patients' notes.

In [None]:
# Character length of every note in the low-qualification subset.
low_note_chars = low['text_y'].str.len()
low['text_length'] = low_note_chars

# Histogram of the note lengths, labelled through the Axes returned by hist().
length_axes = low['text_length'].hist()
length_axes.set_title('Histogram of Text Lengths for perceived low socioeconomic patients')
length_axes.set_xlabel('Length of Text')
length_axes.set_ylabel('Frequency')

# Central tendency of the note lengths.
print("Average length:", low['text_length'].mean())
print("Median length:", low['text_length'].median())

We compute the average and median number of words in the perceived low socioeconomic patients' notes.

In [None]:
# Number of whitespace-separated words per note. Using the .str accessor keeps
# missing notes as NaN (so they are excluded from the statistics, like
# 'text_length') instead of counting the literal string "nan" as one word.
low['word_count'] = low['text_y'].str.split().str.len()
low['word_count'].hist()

plt.title('Histogram word count for perceived low socioeconomic patients')
plt.xlabel('Number of Words')
plt.ylabel('Frequency')

# These are word counts, not character lengths, so label them accordingly.
print("Average word count:", low['word_count'].mean())
print("Median word count:", low['word_count'].median())

## Sentiment analysis





**1. Pretrained BERT Pysentimiento**

We install and import the necessary libraries.

In [None]:
# Install pysentimiento from within the notebook (IPython shell command).
!pip install pysentimiento

from pysentimiento import create_analyzer
import transformers

# Restrict transformers logging to error-level messages.
transformers.logging.set_verbosity(transformers.logging.ERROR)

# Sentiment analyzer configured for Spanish; reused by the prediction cells below.
analyzer = create_analyzer(task="sentiment", lang="es")

We apply RoBERTuito for the perceived high and low socioeconomic patients' notes, respectively.

In [None]:
### Sentiment in texts for perceived high socioeconomic patients:
# Tally the analyzer's POS / NEU / NEG outputs over every note; any other
# output value is ignored.
tally_high = {'POS': 0, 'NEU': 0, 'NEG': 0}

for text in high['text_y']:
    result_high = analyzer.predict(text)
    if result_high.output in tally_high:
        tally_high[result_high.output] += 1

positive_count = tally_high['POS']
neutral_count = tally_high['NEU']
negative_count = tally_high['NEG']

print("Positive:", positive_count)
print("Neutral:", neutral_count)
print("Negative:", negative_count)

In [None]:
### Sentiment in texts for perceived low socioeconomic patients:
# Tally the analyzer's POS / NEU / NEG outputs over every note; any other
# output value is ignored.
tally_low = {'POS': 0, 'NEU': 0, 'NEG': 0}

for text in low['text_y']:
    result_low = analyzer.predict(text)
    if result_low.output in tally_low:
        tally_low[result_low.output] += 1

positive_count = tally_low['POS']
neutral_count = tally_low['NEU']
negative_count = tally_low['NEG']

print("Positive:", positive_count)
print("Neutral:", neutral_count)
print("Negative:", negative_count)

**2. AFINN with Spanish Lexicon**

We install and import the necessary libraries.

In [None]:
# Install and import TextBlob (IPython shell command followed by the import).
!pip install textblob
from textblob import TextBlob, Word

# Install NLTK and fetch the resources requested below.
!pip install nltk
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import string
# NOTE(review): `os` does not appear to be used in the visible cells.
import os
# Spanish stopwords as a set; Catalan entries are added to it in a later cell.
stop_words = set(stopwords.words('spanish'))

First, we initialise an empty dictionary to store the AFINN lexicon. Next, we load the AFINN lexicon from a CSV file into a DataFrame; only the first two columns are used. Then, we iterate through each row of the DataFrame, extracting the word and its sentiment score.

In [None]:
# Word -> sentiment score lookup built from the AFINN CSV.
afinn = {}

# Only the first two columns are kept; the loop reads 'palabra' (word) and
# 'puntuacion' (score) from them.
lexicon_df = pd.read_csv('/content/lexico_afinn.csv', header=0)
lexicon_df = lexicon_df.iloc[:, :2]

for word, sentiment_score_str in zip(lexicon_df['palabra'], lexicon_df['puntuacion']):
    try:
        sentiment_score = float(sentiment_score_str)
    except (TypeError, ValueError):
        sentiment_score = float('nan')

    # An empty CSV cell is read as NaN and float(NaN) succeeds, so missing
    # scores must be rejected explicitly: a NaN in the lexicon would turn the
    # summed score of every text containing that word into NaN.
    if pd.isna(word) or pd.isna(sentiment_score):
        print(f"Invalid sentiment score '{sentiment_score_str}' for word '{word}'. Skipping row.")
        continue

    afinn[word] = sentiment_score

We define a function that calculates the sentiment of the text. First, convert the text to lowercase and split it into words. Then, calculate the sentiment score by summing the scores of individual words using the AFINN dictionary.

In [None]:
def calculate_sentiment(text, lexicon=None):
    """Classify *text* as 'Positive', 'Negative' or 'Neutral' with a word lexicon.

    The text is lower-cased and split on whitespace; the scores of the words
    found in the lexicon are summed (unknown words contribute 0) and the sign
    of the total decides the label.

    Args:
        text: The string to score.
        lexicon: Optional mapping of word -> numeric score. When omitted, the
            notebook-level ``afinn`` dictionary is used, as before.

    Returns:
        'Positive' if the total is > 0, 'Negative' if it is < 0, otherwise
        'Neutral'.
    """
    scores = afinn if lexicon is None else lexicon
    total = sum(scores.get(token, 0) for token in text.lower().split())
    if total > 0:
        return 'Positive'
    if total < 0:
        return 'Negative'
    return 'Neutral'

We also define a list of Catalan stopwords and add them to the Spanish ones.

In [None]:
# Hand-written Catalan stopword list (kept verbatim, including repeated entries).
catalan_stopwords = [
    'de', 'es', 'i', 'a', 'o', 'un', 'una', 'unes', 'uns', 'un', 'tot',
    'també', 'altre', 'algun', 'alguna', 'alguns', 'algunes', 'ser', 'és',
    'soc', 'ets', 'som', 'estic', 'està', 'estem', 'esteu', 'estan', 'com',
    'en', 'per', 'perquè', 'per que', 'estat', 'estava', 'ans', 'abans',
    'éssent', 'ambdós', 'però', 'per', 'poder', 'potser', 'puc', 'podem',
    'podeu', 'poden', 'vaig', 'va', 'van', 'fer', 'faig', 'fa', 'fem',
    'feu', 'fan', 'cada', 'fi', 'inclòs', 'primer', 'des de', 'conseguir',
    'consegueixo', 'consigueix', 'consigueixes', 'conseguim', 'consigueixen',
    'anar', 'haver', 'tenir', 'tinc', 'te', 'tenim', 'teniu', 'tene', 'el',
    'la', 'les', 'els', 'seu', 'aquí', 'meu', 'teu', 'ells', 'elles', 'ens',
    'nosaltres', 'vosaltres', 'si', 'dins', 'sols', 'solament', 'saber',
    'saps', 'sap', 'sabem', 'sabeu', 'saben', 'últim', 'llarg', 'bastant',
    'fas', 'molts', 'aquells', 'aquelles', 'seus', 'llavors', 'sota', 'dalt',
    'ús', 'molt', 'era', 'eres', 'erem', 'eren', 'mode', 'bé', 'quant',
    'quan', 'on', 'mentre', 'qui', 'amb', 'entre', 'sense', 'jo', 'aquell'
]

# Stopwords are matched against single tokens, so multi-word entries such as
# 'per que' and 'des de' could never match as written; add their individual
# words instead. `stop_words` is a set, so repeated entries are harmless.
stop_words.update(
    token for entry in catalan_stopwords for token in entry.split()
)

We define a function to preprocess text data. The text is converted to lower case, punctuation removed, tokenised using TextBlob, lemmatised, and stopwords removed.

In [None]:
# string.punctuation only contains ASCII characters, so the Spanish opening
# marks and common typographic quotes/dashes are added explicitly; otherwise
# tokens such as '¿qué' keep the mark and match neither stopwords nor lexicon.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation + '¿¡«»“”‘’…–—')


def preprocess_text(text):
    """Normalise one note for lexicon lookup and topic modelling.

    Steps: lower-case, strip punctuation, tokenise with TextBlob, drop the
    tokens found in ``stop_words``, lemmatise the rest and join them with
    single spaces.

    Args:
        text: The raw note. Missing values (``pd.isna``) are accepted.

    Returns:
        The cleaned text as a single string; ``""`` for missing input.
    """
    if pd.isna(text):
        return ""
    # str() lets non-string cell values (e.g. numbers) pass through instead of
    # failing on .lower().
    text = str(text).lower()
    text = text.translate(_PUNCTUATION_TABLE)
    tokens = TextBlob(text).words
    tokens = [word for word in tokens if word not in stop_words]
    # NOTE(review): Word.lemmatize() presumably relies on the WordNet resource
    # downloaded earlier, which is an English lexicon - confirm that this step
    # does anything useful for Spanish/Catalan notes.
    lemmatized_tokens = [Word(word).lemmatize() for word in tokens]
    preprocessed_text = ' '.join(lemmatized_tokens)

    return preprocessed_text

# Store the cleaned version of every note next to the raw text, for both subsets.
for notes_df in (high, low):
    notes_df['preprocessed_text'] = notes_df['text_y'].apply(preprocess_text)

We apply the calculate_sentiment function to the 'preprocessed_text' column of the two dataframes and store the sentiment labels in a new column 'sentiment'.

In [None]:
# Label every cleaned note with its lexicon-based sentiment; the function is
# passed directly, no wrapper lambda needed.
high['sentiment'] = high['preprocessed_text'].apply(calculate_sentiment)
low['sentiment'] = low['preprocessed_text'].apply(calculate_sentiment)

We count the number of occurrences of each sentiment label ('Positive', 'Negative', 'Neutral') for the perceived high socioeconomic patients' notes.

In [None]:
# Number of notes per 'sentiment' label in the high-qualification subset;
# the bare name on the last line displays the result in the notebook.
sentiment_counts_high = high['sentiment'].value_counts()
sentiment_counts_high

We count the number of occurrences of each sentiment label ('Positive', 'Negative', 'Neutral') for the perceived low socioeconomic patients' notes.

In [None]:
# Number of notes per 'sentiment' label in the low-qualification subset;
# the bare name on the last line displays the result in the notebook.
sentiment_counts_low = low['sentiment'].value_counts()
sentiment_counts_low

## Topic modelling and document embedding

**LDA**

We install and import the necessary libraries.

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel

We use LDA for topic modelling, following these steps:

In [None]:
# Step 1: tokenisation helper
def tokenize_text(text):
    """Return the lower-cased alphabetic tokens of *text* that are not stopwords."""
    kept = []
    for token in word_tokenize(text.lower()):
        # Skip anything that is not purely alphabetic (numbers, symbols).
        if not token.isalpha():
            continue
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Token lists for every cleaned note, in both subsets.
for notes_df in (high, low):
    notes_df['tokenized_text'] = notes_df['preprocessed_text'].apply(tokenize_text)

# One gensim dictionary per subset (high -> dictionary, low -> dictionary_un).
dictionary = Dictionary(high['tokenized_text'])
dictionary_un = Dictionary(low['tokenized_text'])

# Prune both vocabularies with the same no_below / no_above thresholds.
for vocabulary in (dictionary, dictionary_un):
    vocabulary.filter_extremes(no_below=4, no_above=0.7)

# Bag-of-words corpus of each subset, built with its own dictionary.
corpus_high = list(map(dictionary.doc2bow, high['tokenized_text']))
corpus_low = list(map(dictionary_un.doc2bow, low['tokenized_text']))

Now we apply the LDA model and extract 2 topics for the perceived high and low socioeconomic patients' notes, respectively:

In [None]:
num_topics = 2

# LDA training is stochastic; a fixed seed makes the printed topics
# reproducible between runs (same seed value as the t-SNE cell).
lda_seed = 42

# LDA in perceived high socioeconomic patients
lda_high = LdaModel(corpus_high, num_topics=num_topics, id2word=dictionary,
                    passes=15, random_state=lda_seed)

# LDA in perceived low socioeconomic patients
lda_low = LdaModel(corpus_low, num_topics=num_topics, id2word=dictionary_un,
                   passes=15, random_state=lda_seed)

# Topics for the perceived high socioeconomic patients (top 10 words each)
print("high Topics:")
for i, topic in lda_high.print_topics(num_words=10):
    print(f"Topic {i}: {topic}")

# Topics for the perceived low socioeconomic patients (top 10 words each)
print("\nlow Topics:")
for i, topic in lda_low.print_topics(num_words=10):
    print(f"Topic {i}: {topic}")

**t-SNE**

We install and import the necessary libraries:

In [None]:
!pip install pandas scikit-learn
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns

We preprocess the text in the profesion_df dataframe, and then we vectorise it.

In [None]:
# Clean every note of the full dataframe and keep the result as a column.
cleaned_notes = profesion_df['text_y'].apply(preprocess_text)
profesion_df['preprocessed_text'] = cleaned_notes

# TF-IDF representation of the cleaned notes (default vectoriser settings).
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(profesion_df['preprocessed_text'])

We apply t-SNE and plot the result.

In [None]:
# Project the TF-IDF vectors to 2 dimensions; the fixed seed keeps the layout
# reproducible. The sparse matrix is converted to a dense array first.
tsne = TSNE(n_components=2, random_state=42)

tsne_results = tsne.fit_transform(tfidf_matrix.toarray())

profesion_df['tsne_1'] = tsne_results[:, 0]
profesion_df['tsne_2'] = tsne_results[:, 1]

#plot
plt.figure(figsize=(10, 6))
scatter = sns.scatterplot(
    x='tsne_1', y='tsne_2',
    hue='Low qualification',
    palette=sns.color_palette("tab10", 2),
    data=profesion_df,
    legend="full",
    alpha=0.6
)

# Translate each legend entry from its actual hue value rather than by
# position, so a group can never be mislabelled if only one of them is
# present or the order changes. Unrecognised labels are shown unchanged.
status_names = {'0': 'High', '1': 'Low', '0.0': 'High', '1.0': 'Low'}
handles, labels = scatter.get_legend_handles_labels()
labels = [status_names.get(label, label) for label in labels]
scatter.legend(handles, labels, title="Perceived socioeconomic status")

plt.title('t-SNE visualization of topics for the perceived high vs low socieconomic patients')
plt.show()