We load the data for the protected variable Age, obtained during preprocessing, into a dataframe. The columns are: name, text_y, age, older.

In [None]:
import pandas as pd
# Read the Excel workbook with the age-annotated notes into a DataFrame.
age_df = pd.read_excel('/content/ex_age.xlsx')
# Bare expression: the notebook renders the DataFrame as the cell output.
age_df

We create two dataframes: one for patients older than 60 ('older') and another for patients younger than 60 ('younger').

In [None]:
# Split the notes by the binary 'older' flag (1 -> older cohort, 0 -> younger cohort).
# .copy() gives each cohort its own DataFrame, so the columns added to them in
# later cells are written to real frames rather than to slices of age_df
# (this avoids pandas' SettingWithCopyWarning).
older = age_df[age_df['older'] == 1].copy()
younger = age_df[age_df['older'] == 0].copy()

## Descriptive statistics

We calculate the length of the text of older patients' notes.

In [None]:
import matplotlib.pyplot as plt

# Character count of every older-patient note.
older['text_length'] = older['text_y'].str.len()

# Histogram of the note lengths, labelled through the Axes that hist() returns.
ax = older['text_length'].hist()
ax.set_title('Histogram of Text Lengths for older patients')
ax.set_xlabel('Length of Text')
ax.set_ylabel('Frequency')

# Summary statistics of the same column.
mean_length = older['text_length'].mean()
print("Average length:", mean_length)
median_length = older['text_length'].median()
print("Median length:", median_length)

We count the words of the older patients' notes.

In [None]:
# Number of whitespace-separated words in every older-patient note.
# na_action='ignore' leaves missing notes as NaN (the same way text_length
# treats them) instead of turning them into the string 'nan' and counting
# that as one word.
older['word_count'] = older['text_y'].map(
    lambda x: len(str(x).split()), na_action='ignore'
)
older['word_count'].hist()

plt.title('Histogram word count for older patients')
plt.xlabel('Number of Words')
plt.ylabel('Frequency')

# mean() and median() skip NaN, so missing notes do not bias the statistics.
print("Average length:", older['word_count'].mean())
print("Median length:", older['word_count'].median())

We calculate the length of the text for younger patients.

In [None]:
# Character count of every younger-patient note.
younger['text_length'] = younger['text_y'].str.len()

# Histogram of the note lengths, labelled through the Axes that hist() returns.
ax = younger['text_length'].hist()
ax.set_title('Histogram of Text Lengths for younger patients')
ax.set_xlabel('Length of Text')
ax.set_ylabel('Frequency')

# Summary statistics of the same column.
mean_length = younger['text_length'].mean()
print("Average length:", mean_length)
median_length = younger['text_length'].median()
print("Median length:", median_length)

We count the words of the younger patients' notes.

In [None]:
# Number of whitespace-separated words in every younger-patient note.
# na_action='ignore' leaves missing notes as NaN (the same way text_length
# treats them) instead of turning them into the string 'nan' and counting
# that as one word.
younger['word_count'] = younger['text_y'].map(
    lambda x: len(str(x).split()), na_action='ignore'
)
younger['word_count'].hist()

plt.title('Histogram word count for younger patients')
plt.xlabel('Number of Words')
plt.ylabel('Frequency')

# mean() and median() skip NaN, so missing notes do not bias the statistics.
print("Average length:", younger['word_count'].mean())
print("Median length:", younger['word_count'].median())

## Sentiment analysis





**1. Pretrained BERT Pysentimiento**

We install and import the necessary libraries.

In [None]:
!pip install pysentimiento

from pysentimiento import create_analyzer
import transformers

# Restrict the transformers library's logging to errors only.
transformers.logging.set_verbosity(transformers.logging.ERROR)

# Spanish sentiment analyzer from pysentimiento; later cells call analyzer.predict() on each note.
analyzer = create_analyzer(task="sentiment", lang="es")

We apply RoBERTuito for the older and younger patients' notes, respectively.

In [None]:
###Sentiment in texts for older:
# Running tally of the three labels the loop looks for in result.output.
label_totals = {'POS': 0, 'NEU': 0, 'NEG': 0}

for text in older['text_y']:
    result_older = analyzer.predict(text)
    label = result_older.output
    # Any label other than POS / NEU / NEG is ignored.
    if label in label_totals:
        label_totals[label] += 1

positive_count = label_totals['POS']
neutral_count = label_totals['NEU']
negative_count = label_totals['NEG']

print("Positive:", positive_count)
print("Neutral:", neutral_count)
print("Negative:", negative_count)

In [None]:
###Sentiment in texts for younger:
# Running tally of the three labels the loop looks for in result.output.
label_totals = {'POS': 0, 'NEU': 0, 'NEG': 0}

for text in younger['text_y']:
    result_younger = analyzer.predict(text)
    label = result_younger.output
    # Any label other than POS / NEU / NEG is ignored.
    if label in label_totals:
        label_totals[label] += 1

positive_count = label_totals['POS']
neutral_count = label_totals['NEU']
negative_count = label_totals['NEG']

print("Positive:", positive_count)
print("Neutral:", neutral_count)
print("Negative:", negative_count)

**2. AFINN with Spanish Lexicon**

We install and import the necessary libraries.

In [None]:
!pip install textblob
from textblob import TextBlob, Word

!pip install nltk
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK resources used in this notebook: stopword lists, the punkt
# tokenizer data and the WordNet corpus.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import string
import os
# Spanish stopwords held in a set: membership tests are cheap and more words
# can be merged in later with .update().
stop_words = set(stopwords.words('spanish'))

First, we initialise an empty dictionary to store the AFINN lexicon. Afterwards, we load the AFINN lexicon from a CSV file into a DataFrame; only the first two columns are used. Then, we iterate through each row of the DataFrame, extracting the word and its sentiment score.

In [None]:
# Word -> sentiment score lookup built from the Spanish AFINN lexicon.
afinn = {}

# Only the first two columns are kept; they are read by name below
# ('palabra' = word, 'puntuacion' = score).
lexicon_df = pd.read_csv('/content/lexico_afinn.csv', header=0)
lexicon_df = lexicon_df.iloc[:, :2]

for word, sentiment_score_str in zip(lexicon_df['palabra'], lexicon_df['puntuacion']):
    try:
        sentiment_score = float(sentiment_score_str)
    except (TypeError, ValueError):
        sentiment_score = None

    # A blank cell is read as NaN, which float() accepts. A NaN score would make
    # the summed score of every text containing the word NaN, so treat it as
    # invalid, just like a value that cannot be parsed.
    if sentiment_score is None or pd.isna(sentiment_score):
        print(f"Invalid sentiment score '{sentiment_score_str}' for word '{word}'. Skipping row.")
        continue

    afinn[word] = sentiment_score

We define a function that calculates the sentiment of the text. First, convert the text to lowercase and split it into words. Then, calculate the sentiment score by summing the scores of individual words using the AFINN dictionary.

In [None]:
def calculate_sentiment(text, lexicon=None):
    """Label a text 'Positive', 'Negative' or 'Neutral' from summed word scores.

    The text is lower-cased and split on whitespace. Each word contributes its
    score from the lexicon; words missing from the lexicon contribute 0.

    Args:
        text: String to score.
        lexicon: Optional mapping of word -> numeric score. When omitted, the
            module-level ``afinn`` dictionary is used, so existing one-argument
            calls behave as before.

    Returns:
        'Positive' if the total score is > 0, 'Negative' if it is < 0,
        and 'Neutral' otherwise (including for an empty text).
    """
    if lexicon is None:
        # Looked up at call time, so the lexicon only has to exist when the
        # function is actually used.
        lexicon = afinn

    total_score = sum(lexicon.get(word, 0) for word in text.lower().split())

    if total_score > 0:
        return 'Positive'
    if total_score < 0:
        return 'Negative'
    return 'Neutral'

We also define a list of Catalan stopwords and add them to the Spanish ones.

In [None]:
# Hand-written list of Catalan stopwords (the original chained assignment
# `catalan_stopwords = catalan_stopwords = [...]` was redundant).
catalan_stopwords = [
    'de', 'es', 'i', 'a', 'o', 'un', 'una', 'unes', 'uns', 'un', 'tot',
    'també', 'altre', 'algun', 'alguna', 'alguns', 'algunes', 'ser', 'és',
    'soc', 'ets', 'som', 'estic', 'està', 'estem', 'esteu', 'estan', 'com',
    'en', 'per', 'perquè', 'per que', 'estat', 'estava', 'ans', 'abans',
    'éssent', 'ambdós', 'però', 'per', 'poder', 'potser', 'puc', 'podem',
    'podeu', 'poden', 'vaig', 'va', 'van', 'fer', 'faig', 'fa', 'fem',
    'feu', 'fan', 'cada', 'fi', 'inclòs', 'primer', 'des de', 'conseguir',
    'consegueixo', 'consigueix', 'consigueixes', 'conseguim', 'consigueixen',
    'anar', 'haver', 'tenir', 'tinc', 'te', 'tenim', 'teniu', 'tene', 'el',
    'la', 'les', 'els', 'seu', 'aquí', 'meu', 'teu', 'ells', 'elles', 'ens',
    'nosaltres', 'vosaltres', 'si', 'dins', 'sols', 'solament', 'saber',
    'saps', 'sap', 'sabem', 'sabeu', 'saben', 'últim', 'llarg', 'bastant',
    'fas', 'molts', 'aquells', 'aquelles', 'seus', 'llavors', 'sota', 'dalt',
    'ús', 'molt', 'era', 'eres', 'erem', 'eren', 'mode', 'bé', 'quant',
    'quan', 'on', 'mentre', 'qui', 'amb', 'entre', 'sense', 'jo', 'aquell'
]

# Stopwords are compared against single tokens, so a multi-word entry such as
# 'per que' or 'des de' could never match. Split every entry on whitespace so
# that each of its words is filtered; the set also absorbs the duplicates.
stop_words.update(
    token for entry in catalan_stopwords for token in entry.split()
)

We define a function to preprocess text data. The text is converted to lower case, punctuation is removed, the text is tokenised using TextBlob, stopwords are removed, and the remaining tokens are lemmatised.

In [None]:
# string.punctuation only covers ASCII. Spanish/Catalan notes also contain
# inverted marks and typographic quotes/dashes, which would otherwise stay
# glued to a word and stop it matching a stopword or a lexicon entry.
# Built once rather than on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation + '¿¡«»“”‘’…–—')


def preprocess_text(text):
    """Normalise one note for lexicon lookup and topic modelling.

    Steps: lower-case, strip punctuation, tokenise with TextBlob, drop the
    tokens found in ``stop_words``, lemmatise what is left and join the
    result with single spaces.

    Args:
        text: The raw note. Missing values (NaN/None) are accepted; other
            non-string values are converted with ``str()``.

    Returns:
        The cleaned text as one space-separated string, or "" for a missing note.
    """
    if pd.isna(text):
        return ""

    text = str(text).lower()
    text = text.translate(_PUNCTUATION_TABLE)

    tokens = [word for word in TextBlob(text).words if word not in stop_words]

    # NOTE(review): TextBlob's lemmatizer is WordNet-based, i.e. built for
    # English - confirm it does anything useful on Spanish/Catalan tokens.
    lemmatized_tokens = [Word(word).lemmatize() for word in tokens]

    return ' '.join(lemmatized_tokens)

# Store the cleaned version of every note in both cohort frames.
for cohort_df in (older, younger):
    cohort_df['preprocessed_text'] = cohort_df['text_y'].apply(preprocess_text)

We apply the calculate_sentiment function to the 'preprocessed_text' column of the two dataframes and store the sentiment labels in a new column 'sentiment'.

In [None]:
# Label every preprocessed note; the function is passed directly, no wrapper needed.
older['sentiment'] = older['preprocessed_text'].apply(calculate_sentiment)
younger['sentiment'] = younger['preprocessed_text'].apply(calculate_sentiment)

We count the number of occurrences of each sentiment label ('Positive', 'Negative', 'Neutral') for older patients' notes.


In [None]:
# Number of older-patient notes per sentiment label.
sentiment_counts_older = older['sentiment'].value_counts()
# Bare expression: the notebook renders the counts as the cell output.
sentiment_counts_older

We count the number of occurrences of each sentiment label ('Positive', 'Negative', 'Neutral') for younger patients' notes.

In [None]:
# Number of younger-patient notes per sentiment label.
sentiment_counts_younger = younger['sentiment'].value_counts()
# Bare expression: the notebook renders the counts as the cell output.
sentiment_counts_younger

## Topic modelling and document embedding

**LDA**

We install and import the necessary libraries.

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel

We use LDA for topic modelling, following these steps:

In [None]:
#Firstly we tokenize the text:
def tokenize_text(text):
    """Return the lower-cased, purely alphabetic, non-stopword tokens of a text.

    The text is lower-cased and split with NLTK's word_tokenize; tokens that
    are not entirely alphabetic or that appear in ``stop_words`` are dropped.
    """
    kept_tokens = []
    for token in word_tokenize(text.lower()):
        if not token.isalpha():
            continue
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

#We tokenize the preprocessed text, then build one vocabulary and one
#bag-of-words corpus per cohort.

# --- Older patients ---
older['tokenized_text'] = older['preprocessed_text'].apply(tokenize_text)
dictionary = Dictionary(older['tokenized_text'])
# Keep tokens found in at least 4 notes and in at most 40% of the notes
dictionary.filter_extremes(no_below=4, no_above=0.4)
corpus_older = list(map(dictionary.doc2bow, older['tokenized_text']))

# --- Younger patients ---
younger['tokenized_text'] = younger['preprocessed_text'].apply(tokenize_text)
dictionary_non = Dictionary(younger['tokenized_text'])
# Same vocabulary pruning as for the older cohort
dictionary_non.filter_extremes(no_below=4, no_above=0.4)
corpus_younger = list(map(dictionary_non.doc2bow, younger['tokenized_text']))

Now we apply the LDA model, and represent 2 topics for older and younger patients' notes, respectively:

In [None]:
num_topics = 2

# random_state fixes the seed of LDA's random initialisation so the topics are
# the same on every run; 42 matches the seed this notebook uses for t-SNE and
# for the train/test split.

#LDA in older patients
lda_older = LdaModel(corpus_older, num_topics=num_topics, id2word=dictionary, passes=10, random_state=42)

#LDA in younger patients
lda_younger = LdaModel(corpus_younger, num_topics=num_topics, id2word=dictionary_non, passes=10, random_state=42)

#Topics for the older patients (10 highest-weighted words per topic)
print("Older Topics:")
for i, topic in lda_older.print_topics(num_words=10):
    print(f"Topic {i}: {topic}")

#Topics for the younger patients (10 highest-weighted words per topic)
print("Younger Topics:")
for i, topic in lda_younger.print_topics(num_words=10):
    print(f"Topic {i}: {topic}")

**t-SNE**

We install and import the necessary libraries:

In [None]:
!pip install pandas scikit-learn
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns

We preprocess the text in the age_df dataframe, and then we vectorise it.

In [None]:
# Clean every note of the full dataset with the same preprocessing used for the cohorts.
cleaned_notes = age_df['text_y'].apply(preprocess_text)
age_df['preprocessed_text'] = cleaned_notes

# Learn the TF-IDF vocabulary on the cleaned notes and vectorise them in one step.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_notes)

We apply t-SNE and plot the result.

In [None]:
# Project the dense TF-IDF matrix onto two dimensions (seed fixed).
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(tfidf_matrix.toarray())

# Keep the coordinates next to the notes so seaborn can read them from age_df.
age_df['tsne_1'], age_df['tsne_2'] = tsne_results[:, 0], tsne_results[:, 1]

#plot
fig, ax = plt.subplots(figsize=(10, 6))
scatter = sns.scatterplot(
    data=age_df,
    x='tsne_1',
    y='tsne_2',
    hue='older',
    palette=sns.color_palette("rocket", 2),
    legend="full",
    alpha=0.6,
    ax=ax,
)

# Swap the raw hue values for readable names.
# NOTE(review): assumes the handles come back as older=0 first, then older=1 - confirm.
handles, _ = scatter.get_legend_handles_labels()
scatter.legend(handles, ['Younger', 'Older'], title="Age")

scatter.set_title('t-SNE visualization of topics for each age group')
plt.show()

# Logistic regression

We intend to predict, from the text, whether a patient belongs to the older or the younger category. First, we import the necessary libraries.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

Then, we follow the next steps to train the logistic regression model, make predictions, and get a confusion matrix and accuracy score.

In [None]:
#Hold out 20% of the notes for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    age_df['text_y'],
    age_df['older'],
    test_size=0.2,
    random_state=42,
)

#TF-IDF features: vocabulary learned on the raw training notes only, then reused for the test notes
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

#Fit the classifier (fit() returns the fitted estimator) and predict the held-out notes
logreg_model = LogisticRegression().fit(X_train_tfidf, y_train)
predictions = logreg_model.predict(X_test_tfidf)

#Overall accuracy on the test set
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

#Confusion matrix: rows are true classes, columns are predicted classes
conf_matrix = confusion_matrix(y_test, predictions)
true_positives = conf_matrix[1][1]

print("Confusion Matrix:")
print(conf_matrix)
print("True Positives:", true_positives)

Then we build a classification report and compute the accuracy separately for the older and the younger group.

In [None]:
# Per-class precision / recall / F1, with readable names for classes 0 and 1.
class_report = classification_report(
    y_test,
    predictions,
    target_names=['Younger', 'Older'],
)
print("Classification Report:")
print(class_report)

# Unpack the 2x2 confusion matrix row by row:
# first row = true class 0 (younger), second row = true class 1 (older).
(TN, FP), (FN, TP) = conf_matrix

# Share of each true group that was predicted correctly.
accuracy_younger = TN / (TN + FP)
accuracy_older = TP / (TP + FN)

print(f'Accuracy for texts from younger: {accuracy_younger}')
print(f'Accuracy for texts from older: {accuracy_older}')