We load the data for the protected variable Gender, obtained during preprocessing, into a dataframe. The columns are: name, text, gender.

**Uploading the data**

In [None]:
import pandas as pd
# Read the spreadsheet holding the gender data into a dataframe.
gender_df = pd.read_excel(io='/content/ex_gender.xlsx')
# Bare expression so the notebook displays the dataframe.
gender_df

We create two dataframes, one for women and another for men.

In [None]:
# Rows with gender == 1 go to `women`, rows with gender == 0 go to `men`.
# Explicit copies are taken because later cells add columns (text_length,
# word_count, preprocessed_text, ...) to each subset; writing to a slice of
# gender_df would raise pandas' SettingWithCopyWarning.
women = gender_df[gender_df['gender'] == 1].copy()
men = gender_df[gender_df['gender'] == 0].copy()

## Descriptive statistics

We calculate the length of the text of women's notes.

In [None]:
import matplotlib.pyplot as plt

# Number of characters in each of the women's notes.
women['text_length'] = women['text'].str.len()
women_lengths = women['text_length']

# Distribution of the note lengths.
women_lengths.hist()
plt.xlabel('Length of Text')
plt.ylabel('Frequency')
plt.title('Histogram of Text Lengths for women')

# Summary statistics of the note lengths.
print("Average length:", women_lengths.mean())
print("Median length:", women_lengths.median())

We compute the average and median number of words in women's notes.

In [None]:
# Whitespace-separated words per note. A missing note stays missing (NaN)
# instead of being converted to the one-word string "nan", so it is left out
# of the histogram, the mean and the median.
women['word_count'] = women['text'].apply(
    lambda note: len(str(note).split()) if pd.notna(note) else float('nan')
)
women['word_count'].hist()

plt.title('Histogram word count for women')
plt.xlabel('Number of Words')
plt.ylabel('Frequency')

# These are word counts, so the labels say so (they previously said "length").
print("Average word count:", women['word_count'].mean())
print("Median word count:", women['word_count'].median())

We calculate the length of the text for men's notes.

In [None]:
# Number of characters in each of the men's notes.
men['text_length'] = men['text'].str.len()
men_lengths = men['text_length']

# Distribution of the note lengths.
men_lengths.hist()
plt.xlabel('Length of Text')
plt.ylabel('Frequency')
plt.title('Histogram of Text Lengths for men')

# Summary statistics of the note lengths.
print("Average length:", men_lengths.mean())
print("Median length:", men_lengths.median())

We count the average and median words in men's notes.

In [None]:
# Whitespace-separated words per note. A missing note stays missing (NaN)
# instead of being converted to the one-word string "nan", so it is left out
# of the histogram, the mean and the median.
men['word_count'] = men['text'].apply(
    lambda note: len(str(note).split()) if pd.notna(note) else float('nan')
)
men['word_count'].hist()

plt.title('Histogram word count for men')
plt.xlabel('Number of Words')
plt.ylabel('Frequency')

# These are word counts, so the labels say so (they previously said "length").
print("Average word count:", men['word_count'].mean())
print("Median word count:", men['word_count'].median())

## Sentiment analysis





**1. Pretrained BERT Pysentimiento**

We install and import the necessary libraries.

In [None]:
!pip install pysentimiento

from pysentimiento import create_analyzer
import transformers

# Set the transformers log verbosity to ERROR.
transformers.logging.set_verbosity(transformers.logging.ERROR)

# Sentiment analyzer for Spanish text, used by the prediction loops below.
analyzer = create_analyzer(lang="es", task="sentiment")

We apply RoBERTuito for women and men's notes, respectively.

In [None]:
###Sentiment in texts for women:
# One tally per label the analyzer output is compared against.
women_tally = {'POS': 0, 'NEU': 0, 'NEG': 0}

# Classify every note and count its label; any other label is ignored.
for text in women['text']:
    result_women = analyzer.predict(text)
    label = result_women.output
    if label in women_tally:
        women_tally[label] += 1

positive_count = women_tally['POS']
neutral_count = women_tally['NEU']
negative_count = women_tally['NEG']

print("Positive:", positive_count)
print("Neutral:", neutral_count)
print("Negative:", negative_count)

In [None]:
###Sentiment in texts for men:
# One tally per label the analyzer output is compared against.
men_tally = {'POS': 0, 'NEU': 0, 'NEG': 0}

# Classify every note and count its label; any other label is ignored.
for text in men['text']:
    result_men = analyzer.predict(text)
    label = result_men.output
    if label in men_tally:
        men_tally[label] += 1

positive_count = men_tally['POS']
neutral_count = men_tally['NEU']
negative_count = men_tally['NEG']

print("Positive:", positive_count)
print("Neutral:", neutral_count)
print("Negative:", negative_count)

**2. AFINN with Spanish Lexicon**

We install and import the necessary libraries.

In [None]:
####AFINN with spanish Lexicon
!pip install textblob
from textblob import TextBlob, Word

!pip install nltk
from nltk.corpus import stopwords
import nltk
# Download the NLTK resources requested by this notebook, in the same order.
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)
import string
import os
# NLTK's Spanish stop words, held in a set; a later cell adds Catalan words.
stop_words = set(stopwords.words('spanish'))

First, we initialise an empty dictionary to store the AFINN lexicon. Next, we load the AFINN lexicon from a CSV file into a DataFrame; only the first two columns are used. Then, we iterate through each row of the DataFrame, extracting the word and its sentiment score.

In [None]:
# Word -> sentiment score lookup built from the AFINN lexicon file.
afinn = {}

lexicon_df = pd.read_csv('/content/lexico_afinn.csv', header=0)
# Keep only the first two columns; the loop reads them by name.
lexicon_df = lexicon_df.iloc[:, :2]

for word, sentiment_score_str in zip(lexicon_df['palabra'], lexicon_df['puntuacion']):
    try:
        sentiment_score = float(sentiment_score_str)
    except (TypeError, ValueError):
        sentiment_score = float('nan')

    # Skip rows with a missing word or a missing/non-numeric score. A NaN score
    # would make the summed score of every text containing the word NaN, and
    # calculate_sentiment would then report that text as 'Neutral'.
    if pd.isna(word) or pd.isna(sentiment_score):
        print(f"Invalid sentiment score '{sentiment_score_str}' for word '{word}'. Skipping row.")
        continue

    # calculate_sentiment lower-cases the text before the lookup, so the keys
    # are normalised the same way to make sure they can match.
    afinn[str(word).strip().lower()] = sentiment_score

We define a function that calculates the sentiment of the text. First, convert the text to lowercase and split it into words. Then, calculate the sentiment score by summing the scores of individual words using the AFINN dictionary.

In [None]:
def calculate_sentiment(text):
    """Label a text as 'Positive', 'Negative' or 'Neutral'.

    The text is lower-cased and split on whitespace. Each word's score is
    looked up in the ``afinn`` dictionary (0 when the word is absent) and the
    scores are added up.

    Args:
        text: The string to score.

    Returns:
        'Positive' when the total is above zero, 'Negative' when it is below
        zero, and 'Neutral' otherwise.
    """
    total = 0
    for token in text.lower().split():
        total += afinn.get(token, 0)

    if total > 0:
        return 'Positive'
    if total < 0:
        return 'Negative'
    return 'Neutral'

We also upload stopwords for Catalan and add them to those for Spanish.

In [None]:
# Hand-written list of Catalan stop words.
# NOTE(review): the multi-word entries ('per que', 'des de') cannot match when
# stop words are compared against single tokens - confirm whether they should
# be split.
catalan_stopwords = [
    'de', 'es', 'i', 'a', 'o', 'un', 'una', 'unes', 'uns', 'un', 'tot',
    'també', 'altre', 'algun', 'alguna', 'alguns', 'algunes', 'ser', 'és',
    'soc', 'ets', 'som', 'estic', 'està', 'estem', 'esteu', 'estan', 'com',
    'en', 'per', 'perquè', 'per que', 'estat', 'estava', 'ans', 'abans',
    'éssent', 'ambdós', 'però', 'per', 'poder', 'potser', 'puc', 'podem',
    'podeu', 'poden', 'vaig', 'va', 'van', 'fer', 'faig', 'fa', 'fem',
    'feu', 'fan', 'cada', 'fi', 'inclòs', 'primer', 'des de', 'conseguir',
    'consegueixo', 'consigueix', 'consigueixes', 'conseguim', 'consigueixen',
    'anar', 'haver', 'tenir', 'tinc', 'te', 'tenim', 'teniu', 'tene', 'el',
    'la', 'les', 'els', 'seu', 'aquí', 'meu', 'teu', 'ells', 'elles', 'ens',
    'nosaltres', 'vosaltres', 'si', 'dins', 'sols', 'solament', 'saber',
    'saps', 'sap', 'sabem', 'sabeu', 'saben', 'últim', 'llarg', 'bastant',
    'fas', 'molts', 'aquells', 'aquelles', 'seus', 'llavors', 'sota', 'dalt',
    'ús', 'molt', 'era', 'eres', 'erem', 'eren', 'mode', 'bé', 'quant',
    'quan', 'on', 'mentre', 'qui', 'amb', 'entre', 'sense', 'jo', 'aquell'
]

# Add the Catalan words to the existing stop-word set in place.
stop_words |= set(catalan_stopwords)

We define a function to preprocess text data. The text is converted to lower case, punctuation removed, tokenised using TextBlob, lemmatised, and stopwords removed.

In [None]:
# Characters removed before tokenising: ASCII punctuation plus the inverted
# marks, guillemets, curly quotes, ellipsis and dashes that string.punctuation
# does not contain. The Catalan middle dot is deliberately kept.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation + '¿¡«»“”‘’…–—')


def preprocess_text(text):
    """Normalise a note before sentiment scoring and topic modelling.

    The text is lower-cased, stripped of punctuation, tokenised with TextBlob,
    filtered against ``stop_words`` and lemmatised with ``Word.lemmatize``.

    Args:
        text: The raw note. Missing values (None/NaN) are accepted; any other
            non-string value is converted with ``str`` first.

    Returns:
        The remaining lemmas joined by single spaces, or "" for a missing note.
    """
    if pd.isna(text):
        return ""

    # Non-string cells would otherwise fail on .lower().
    text = str(text).lower()
    text = text.translate(_PUNCTUATION_TABLE)

    tokens = [word for word in TextBlob(text).words if word not in stop_words]
    # NOTE(review): Word.lemmatize() presumably uses an English lemmatiser;
    # confirm it is appropriate for Spanish/Catalan notes.
    lemmatized_tokens = [Word(word).lemmatize() for word in tokens]

    return ' '.join(lemmatized_tokens)

# Clean every note of both groups with preprocess_text.
women['preprocessed_text'] = women['text'].map(preprocess_text)
men['preprocessed_text'] = men['text'].map(preprocess_text)

We apply the calculate_sentiment function to the 'preprocessed_text' column of the two dataframes and store the sentiment labels in a new column 'sentiment'.

In [None]:
# Label every preprocessed note with calculate_sentiment.
women['sentiment'] = women['preprocessed_text'].apply(calculate_sentiment)
men['sentiment'] = men['preprocessed_text'].apply(calculate_sentiment)

We count the number of occurrences of each sentiment label ('Positive', 'Negative', 'Neutral') for women's notes.

In [None]:
# Number of women's notes per sentiment label.
women_labels = women['sentiment']
sentiment_counts_women = women_labels.value_counts()
# Bare expression so the notebook displays the counts.
sentiment_counts_women

We count the number of occurrences of each sentiment label ('Positive', 'Negative', 'Neutral') for men's notes.

In [None]:
# Number of men's notes per sentiment label.
men_labels = men['sentiment']
sentiment_counts_men = men_labels.value_counts()
# Bare expression so the notebook displays the counts.
sentiment_counts_men

## Topic modelling and document embedding

**LDA**

We install and import the necessary libraries.

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel

We use LDA for topic modelling, following these steps:

In [None]:
#Firstly we tokenize the text
def tokenize_text(text):
    """Split a text into lower-case alphabetic tokens that are not stop words.

    Args:
        text: The string to tokenise.

    Returns:
        A list of the tokens produced by ``word_tokenize`` on the lower-cased
        text, keeping only those that are purely alphabetic and absent from
        ``stop_words``.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if not token.isalpha():
            continue
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# --- Women: tokens, filtered dictionary, bag-of-words corpus ---
women['tokenized_text'] = women['preprocessed_text'].apply(tokenize_text)
dictionary_women = Dictionary(women['tokenized_text'])
# filter_extremes is called with no_below=4 and no_above=0.4.
dictionary_women.filter_extremes(no_below=4, no_above=0.4)
corpus_women = list(map(dictionary_women.doc2bow, women['tokenized_text']))

# --- Men: same steps with the same thresholds ---
men['tokenized_text'] = men['preprocessed_text'].apply(tokenize_text)
dictionary_men = Dictionary(men['tokenized_text'])
dictionary_men.filter_extremes(no_below=4, no_above=0.4)
corpus_men = list(map(dictionary_men.doc2bow, men['tokenized_text']))

Now we apply the LDA model, and represent 2 topics for women and men's notes, respectively:

In [None]:
num_topics = 2

# A fixed random_state makes the topics reproducible between runs; 42 is the
# seed the notebook also uses for t-SNE and the train/test split.

#LDA in women
lda_women = LdaModel(
    corpus_women,
    num_topics=num_topics,
    id2word=dictionary_women,
    passes=10,
    random_state=42,
)

#LDA in men
lda_men = LdaModel(
    corpus_men,
    num_topics=num_topics,
    id2word=dictionary_men,
    passes=10,
    random_state=42,
)

#Topics for women, printed with 10 words per topic
print("Women Topics:")
for i, topic in lda_women.print_topics(num_words=10):
    print(f"Topic {i}: {topic}")

#Topics for men, printed with 10 words per topic
print("Men Topics:")
for i, topic in lda_men.print_topics(num_words=10):
    print(f"Topic {i}: {topic}")

**t-SNE**

We install and import the necessary libraries:

In [None]:
!pip install pandas scikit-learn
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import seaborn as sns

We preprocess the text in the gender_df dataframe, and then we vectorise it.

In [None]:
# Clean the notes of the full dataframe with preprocess_text.
gender_df['preprocessed_text'] = gender_df['text'].map(preprocess_text)
cleaned_notes = gender_df['preprocessed_text']

# TF-IDF representation of the cleaned notes.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_notes)

We apply T-SNE and plot it.

In [None]:
# Two-dimensional t-SNE projection with a fixed seed.
tsne = TSNE(n_components=2, random_state=42)

# The TF-IDF matrix is converted to a dense array before fitting.
tsne_results = tsne.fit_transform(tfidf_matrix.toarray())

gender_df['tsne_1'] = tsne_results[:, 0]
gender_df['tsne_2'] = tsne_results[:, 1]

#plot
plt.figure(figsize=(10, 6))
scatter = sns.scatterplot(
    x='tsne_1', y='tsne_2',
    hue='gender',
    palette=sns.color_palette("hls", 2),
    data=gender_df,
    legend="full",
    alpha=0.6
)

# Rename each legend entry from its own gender code (0 = men, 1 = women, as in
# the women/men split earlier in the notebook). Assigning a fixed
# ['Women', 'Men'] list by position swapped the two names, because the
# entries are listed as 0 then 1.
gender_names = {'0': 'Men', '1': 'Women', '0.0': 'Men', '1.0': 'Women'}
handles, labels = scatter.get_legend_handles_labels()
labels = [gender_names.get(label, label) for label in labels]
scatter.legend(handles, labels, title="Gender")

plt.title('t-SNE visualization of topics for each gender')
plt.show()

# Logistic regression

We intend to predict whether a patient's category is 'Women' or 'Men' from the text. First, we import the necessary libraries.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

Then, we follow the next steps to train the logistic regression model, make predictions, and get a confusion matrix and accuracy score.

In [None]:
#Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(gender_df['text'], gender_df['gender'], test_size=0.2, random_state=42)

#Preprocess text and convert to numerical features using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

#Train logistic regression model
logreg_model = LogisticRegression()
logreg_model.fit(X_train_tfidf, y_train)

#Make predictions on the test set
predictions = logreg_model.predict(X_test_tfidf)

#Evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

#Get the onfusion matrix
conf_matrix = confusion_matrix(y_test, predictions)
true_positives = conf_matrix[1, 1]

print("Confusion Matrix:")
print(conf_matrix)
print("True Positives:", true_positives)

Then we build a classification report, and we get the accuracy for the groups women patients and men patients.

In [None]:
#Per-class report, with class names given in the order ['Men', 'Women']
class_report = classification_report(y_test, predictions, target_names=['Men', 'Women'])
print("Classification Report:")
print(class_report)

#Read the four cells of the confusion matrix row by row
men_row = conf_matrix[0]
women_row = conf_matrix[1]
TN, FP = men_row[0], men_row[1]      #men predicted as men / men predicted as women
FN, TP = women_row[0], women_row[1]  #women predicted as men / women predicted as women

#Share of each group's texts that was predicted correctly
accuracy_women = TP / (TP + FN)
accuracy_men = TN / (TN + FP)

print(f'Accuracy for texts from men: {accuracy_men}')
print(f'Accuracy for texts from women: {accuracy_women}')