# Autor: Cristóbal Arroyo

# Imports and setups

In [56]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
%matplotlib inline

In [57]:
nltk.download('stopwords')
nltk.download('punkt')
stop_words = nltk.corpus.stopwords.words('spanish')

wpt = nltk.WordPunctTokenizer()
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [58]:
df_original = pd.read_excel('./ODScat_345.xlsx')
df_original.columns

Index(['Textos_espanol', 'sdg'], dtype='object')

In [59]:
# Check null data
df_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4049 entries, 0 to 4048
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Textos_espanol  4049 non-null   object
 1   sdg             4049 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 63.4+ KB


# Data preparation and normalization

In [60]:
def fix_encoding(text):
    replacements = {
        'Ã¡': 'a',
        'Ã©': 'e',
        'Ã­': 'i',
        'Ã³': 'o',
        'Ãº': 'u',
        'Ã±': 'ñ'
    }

    for wrong, correct in replacements.items():
        text = text.replace(wrong, correct)

    return text

df_original['Textos_espanol'] = df_original['Textos_espanol'].apply(fix_encoding)

In [61]:
corpus = np.array(df_original['Textos_espanol'])

In [62]:
def normalize_documents(doc):
    # Remove lower case and special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc)
    doc = doc.lower()
    doc = doc.strip()

    tokens = wpt.tokenize(doc)
    filtered_tokens = [stemmer.stem(token) for token in tokens if token not in stop_words]
    doc = ' '.join(filtered_tokens)

    return doc

normalize_corpus = np.vectorize(normalize_documents)
norm_corpus = normalize_corpus(corpus)

In [63]:
df_cleaned = df_original.copy()
df_cleaned['Clean_Text'] = norm_corpus

# Model

In [64]:
from sklearn.model_selection import train_test_split

x = np.array(df_cleaned['Clean_Text'])
y = np.array(df_cleaned['sdg'])

In [66]:
# Insert errors in y to reduce bias
possibles_sdgs = [3, 4, 5]
def insert_errors(original_sdg, n_replacements, verbose=False):
  random_indices = np.random.choice(y.size, n_replacements, replace=False)

  for i in random_indices:
    old_value = y[i]
    y[i] = np.random.choice(possibles_sdgs)
    while y[i] == old_value:
      y[i] = np.random.choice(possibles_sdgs)
    print(f"y[{i:>4}]: {old_value:}->{y[i]}") if verbose else None

insert_errors(y, 10, verbose=True)

y[1013]: 4->3
y[1286]: 4->5
y[2433]: 5->4
y[2769]: 5->3
y[3360]: 4->5
y[2632]: 5->3
y[1565]: 4->5
y[ 126]: 3->4
y[3110]: 5->3
y[3614]: 3->5


array([3, 3, 3, ..., 5, 5, 5])

In [10]:
# Split test and train data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=1)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((3036,), (1013,), (3036,), (1013,))

In [11]:
from collections import Counter

train_dict = dict(Counter(y_train))
test_dict = dict(Counter(y_test))

# Show labels count for train and test
pd.DataFrame({'Train': train_dict, 'Test': test_dict})

Unnamed: 0,Train,Test
4,1022,332
5,1083,368
3,931,313


In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score

cv = CountVectorizer(min_df=0., max_df=1.)
x_train_features = cv.fit_transform(x_train)
x_test_features = cv.transform(x_test)
x_train_features.shape, x_test_features.shape

In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

mnb = MultinomialNB(alpha=0.1)
mnb.fit(x_train_features, y_train)

y_pred = mnb.predict(x_test_features)

mnb_cv_scores = cross_val_score(mnb, x_train_features, y_train, cv=5)
mnb_cv_mean_score = np.mean(mnb_cv_scores)
print('CV Accuracy (5-fold):', mnb_cv_scores)
print('Mean CV Accuracy:', mnb_cv_mean_score)

mnb_test_score = mnb.score(x_test_features, y_test)
print('Test Accuracy:', mnb_test_score)

print(f"Accuracy: {metrics.accuracy_score(y_test, y_pred)}")
print(metrics.classification_report(y_test, y_pred))

CV Accuracy (5-fold): [0.96710526 0.96869852 0.96046129 0.97199341 0.9752883 ]
Mean CV Accuracy: 0.9687093557617272
Test Accuracy: 0.9693978282329714
Accuracy: 0.9693978282329714
              precision    recall  f1-score   support

           3       0.99      0.95      0.97       313
           4       0.99      0.97      0.98       332
           5       0.94      0.99      0.96       368

    accuracy                           0.97      1013
   macro avg       0.97      0.97      0.97      1013
weighted avg       0.97      0.97      0.97      1013



# Model Tuning Using Pipeline

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

mnb_pipeline = Pipeline([
    ('cv', CountVectorizer()),
    ('mnb', MultinomialNB())
])

param_grid = {
    'cv__min_df': [0.0, 0.1, 0.2],
    'cv__max_df': [1.0, 0.9, 0.8],
    'mnb__alpha': [1e-5, 1e-4, 1e-2, 1e-1, 1]
}

gs_mnb = GridSearchCV(mnb_pipeline, param_grid, cv=5, n_jobs=-1, verbose=1)
gs_mnb = gs_mnb.fit(x_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


KeyboardInterrupt: 

In [None]:
gs_mnb.best_estimator_.get_params()

In [None]:
cv_results = gs_mnb.cv_results_
results_df = pd.DataFrame({'rank': cv_results['rank_test_score'],
                           'params': cv_results['params'],
                           'cv score (mean)': cv_results['mean_test_score'],
                           'cv score (std)': cv_results['std_test_score']})
results_df = results_df.sort_values(by='rank').head()
pd.set_option('display.max_colwidth', 100)
results_df

In [None]:
best_mnb_test_score = gs_mnb.score(x_test, y_test)
print('Test Accuracy :', best_mnb_test_score)

# Construcción del Pipeline de predicción

In [70]:
from sklearn.preprocessing import FunctionTransformer

def fix_document_encoding(doc):
    replacements = {
        'Ã¡': 'a',
        'Ã©': 'e',
        'Ã­': 'i',
        'Ã³': 'o',
        'Ãº': 'u',
        'Ã±': 'ñ'
    }

    for wrong, correct in replacements.items():
        doc = doc.replace(wrong, correct)
    return doc

def normalize_documents(doc):
    # Remove lower case and special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc)
    doc = doc.lower()
    doc = doc.strip()

    tokens = wpt.tokenize(doc)
    filtered_tokens = [stemmer.stem(token) for token in tokens if token not in stop_words]
    doc = ' '.join(filtered_tokens)

    return doc

fix_encoding = np.vectorize(fix_document_encoding)
normalize_corpus = np.vectorize(normalize_documents)

preprocessing_pipeline = Pipeline([
    ('fix_encoding', FunctionTransformer(fix_encoding)),
    ('normalize_corpus', FunctionTransformer(normalize_corpus)),
])

mnb_pipeline = Pipeline([
    ('cv', CountVectorizer(min_df=0., max_df=1.)),
    ('mnb', MultinomialNB())
])

pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('clf', mnb_pipeline)
])


## Prueba del pipeline con train-test split

In [71]:
df_test = pd.read_excel('./ODScat_345.xlsx')

In [75]:
x_train, x_test, y_train, y_test = train_test_split(df_test['Textos_espanol'], df_test['sdg'], test_size=0.25, random_state=1)

# This can't be done in a pipeline as it's not sklearn compliant
possibles_sdgs = [3, 4, 5]
def insert_errors(original_sdg, n_replacements, verbose=False):
  random_indices = np.random.choice(y.size, n_replacements, replace=False)

  for i in random_indices:
    old_value = y[i]
    y[i] = np.random.choice(possibles_sdgs)
    while y[i] == old_value:
      y[i] = np.random.choice(possibles_sdgs)
    print(f"y[{i:>4}]: {old_value:}->{y[i]}") if verbose else None
insert_errors(y_train, 100, verbose=False)

pipeline_model = pipeline.fit(x_train, y_train)

y_pred = pipeline_model.predict(x_test)

# 0.9733464955577492
# 0.9733464955577492

print(f"Accuracy: {metrics.accuracy_score(y_test, y_pred)}")
print(metrics.classification_report(y_test, y_pred))

Accuracy: 0.9733464955577492
              precision    recall  f1-score   support

           3       0.99      0.95      0.97       313
           4       0.99      0.97      0.98       332
           5       0.95      0.99      0.97       368

    accuracy                           0.97      1013
   macro avg       0.98      0.97      0.97      1013
weighted avg       0.97      0.97      0.97      1013



In [47]:
def get_salient_words(nb_clf, vect, class_ind):
    words = vect.get_feature_names_out()
    zipped = list(zip(words, nb_clf.feature_log_prob_[class_ind]))
    sorted_zip = sorted(zipped, key=lambda t: t[1], reverse=True)

    return sorted_zip

class_1 = get_salient_words(pipeline_model.named_steps['clf'].named_steps['mnb'], pipeline_model.named_steps['clf'].named_steps['cv'], 0)[:20]
class_2 = get_salient_words(pipeline_model.named_steps['clf'].named_steps['mnb'], pipeline_model.named_steps['clf'].named_steps['cv'], 1)[:20]
class_3 = get_salient_words(pipeline_model.named_steps['clf'].named_steps['mnb'], pipeline_model.named_steps['clf'].named_steps['cv'], 2)[:20]


print(class_1)
print(class_2)
print(class_3)

[('salud', -4.07722680909881), ('atencin', -4.36549078438924), ('ms', -4.670789284229319), ('servicio', -4.842738282207651), ('pacient', -5.283529259207556), ('pase', -5.295259598993045), ('sistema', -5.298213810890476), ('mental', -5.343607383150839), ('enfermedad', -5.417477231717294), ('ao', -5.420816132982808), ('tambin', -5.465267895553643), ('mdico', -5.4972422001861565), ('primaria', -5.572183621478275), ('persona', -5.5958067672417116), ('calidad', -5.620001495828768), ('mayor', -5.705159304169075), ('ocd', -5.705159304169075), ('nivel', -5.778925454408684), ('tasa', -5.788540913108126), ('tratamiento', -5.853401547310167)]
[('educacin', -4.361143423757457), ('escuela', -4.4542338468234695), ('estudiant', -4.552842142364009), ('ms', -4.743543258934626), ('docent', -5.043240768130818), ('escolar', -5.07222830500407), ('aprendizaj', -5.109098840812397), ('programa', -5.280215798593489), ('nivel', -5.331655709798251), ('sistema', -5.340492291598749), ('ao', -5.370524578697624), ('

In [48]:
# Put classes in a dataframe

pd.DataFrame({'ODS_3': class_2, 'ODS_4': class_3, 'ODS_5': class_1})

Unnamed: 0,ODS_3,ODS_4,ODS_5
0,"(educacin, -4.361143423757457)","(mujer, -3.4908966616291117)","(salud, -4.07722680909881)"
1,"(escuela, -4.4542338468234695)","(gnero, -4.2177572974720166)","(atencin, -4.36549078438924)"
2,"(estudiant, -4.552842142364009)","(ms, -4.756753798204704)","(ms, -4.670789284229319)"
3,"(ms, -4.743543258934626)","(hombr, -4.883505503843848)","(servicio, -4.842738282207651)"
4,"(docent, -5.043240768130818)","(poltica, -5.200904048624106)","(pacient, -5.283529259207556)"
5,"(escolar, -5.07222830500407)","(pase, -5.212572792458483)","(pase, -5.295259598993045)"
6,"(aprendizaj, -5.109098840812397)","(trabajo, -5.23632687846659)","(sistema, -5.298213810890476)"
7,"(programa, -5.280215798593489)","(igualdad, -5.255744964323692)","(mental, -5.343607383150839)"
8,"(nivel, -5.331655709798251)","(derecho, -5.263125071621315)","(enfermedad, -5.417477231717294)"
9,"(sistema, -5.340492291598749)","(tambin, -5.449900978764649)","(ao, -5.420816132982808)"


# Generación de resultados de validación con el pipeline

In [51]:
df_test = pd.read_excel('./TestODScat_345.xlsx')
x_data = df_test['Textos_espanol']
y_pred_test = pipeline_model.predict(x_data)

In [52]:
df_test['sdg'] = y_pred_test
df_test.to_excel('./TestODScat_345_pred.xlsx', index=False)
df_test.to_csv('./TestODScat_345_pred.csv', index=False)

# Using LDA Topic Extraction

In [None]:
# USING LDA TOPIC EXTRACTION
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(norm_corpus)
cv_matrix = cv_matrix.toarray()
vocab = cv.get_feature_names_out()

matrix = pd.DataFrame(cv_matrix, columns=vocab)

lda = LatentDirichletAllocation(n_components = 3, max_iter = 200, random_state=0)
dt_matrix = lda.fit_transform(cv_matrix)

features = pd.DataFrame(dt_matrix, columns=['ODS_5', 'ODS_3', 'ODS_4'])

tt_matrix = lda.components_
for topic_weights in tt_matrix:
  topic = [(token, weight) for token, weight in zip(vocab, topic_weights)]
  topic = sorted(topic, key=lambda x: -x[1])
  topic = [item for item in topic if item[1]>0.6]
  print(topic)

features


[('mujer', 3341.3002985032167), ('gnero', 1527.3182598154638), ('ms', 858.6691692990046), ('hombr', 842.6553964322404), ('poltica', 637.7921525397867), ('derecho', 589.1598374263976), ('igualdad', 579.3118425449144), ('trabajo', 569.9359671564328), ('pase', 558.5153506689833), ('tambin', 439.43544410001067), ('social', 435.90683266119186), ('violencia', 356.3202083807385), ('pueden', 345.8312420947833), ('labor', 338.3766652788401), ('nia', 312.26899458175694), ('pued', 305.5350567166894), ('tiempo', 299.65024603622254), ('mayor', 298.6996474887284), ('participacin', 297.2689465945091), ('desarrollo', 277.9751079231333), ('ejemplo', 277.67499109914513), ('nivel', 274.54679093062305), ('empleo', 264.2362504921913), ('part', 254.6800123617606), ('brecha', 246.46738789772652), ('meno', 242.57619782212686), ('embargo', 238.16423079227317), ('gobierno', 237.6413890751804), ('acceso', 236.36080577924935), ('ser', 232.57606599525377), ('vida', 232.21815290305022), ('hogar', 228.89458005543483

Unnamed: 0,ODS_5,ODS_3,ODS_4
0,0.005026,0.990324,0.004650
1,0.009275,0.981649,0.009076
2,0.440558,0.491165,0.068278
3,0.004190,0.845256,0.150555
4,0.127595,0.867453,0.004951
...,...,...,...
4044,0.988785,0.005619,0.005596
4045,0.982761,0.008437,0.008803
4046,0.979920,0.009945,0.010135
4047,0.990631,0.004624,0.004745


In [None]:
# LDA Results per row
df_lda = pd.concat([df_original, features], axis=1)

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

y = df_lda['sdg']
x = df_lda.drop(['Textos_espanol', 'sdg'], axis=1)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

gnb = GaussianNB()
gnb.fit(x_train, y_train)

y_pred = gnb.predict(x_test)

from sklearn import metrics

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

Accuracy: 0.9543209876543209
              precision    recall  f1-score   support

           3       0.98      0.93      0.95       254
           4       0.96      0.96      0.96       265
           5       0.93      0.97      0.95       291

    accuracy                           0.95       810
   macro avg       0.96      0.95      0.95       810
weighted avg       0.95      0.95      0.95       810



In [None]:
df_original2 = pd.concat([df_original, features], axis=1)
df_topics = df_original2.drop(['Textos_espanol', 'sdg'], axis=1)

def check_category(row):
  ods = (row['ODS_5'], row['ODS_3'], row['ODS_4'])
  index_max = ods.index(max(ods))
  if index_max == 0:
    return 5
  elif index_max == 1:
    return 3
  else:
    return 4

y_pred = df_original2.apply(check_category, axis=1)
df_original2['sdg_pred'] = y_pred
df_original2["correcto"] = df_original2["sdg"] == df_original2["sdg_pred"]
df_errados = df_original2.loc[df_original2["correcto"] == False]
df_errados.to_csv('ODScat_345_LDA_Errores.csv', index=True)
df_errados

Unnamed: 0,Textos_espanol,sdg,ODS_5,ODS_3,ODS_4,sdg_pred,correcto
56,"Para contribuir a este objetivo, en 2011 se creó la Red Conjunta de Funcionarios de Sanidad y Pr...",3,0.562092,0.310253,0.127655,5,False
82,"Por lo tanto, existen muchos ejemplos documentados de cómo diferentes países de la Commonwealth ...",3,0.745073,0.247231,0.007696,5,False
89,Estos requisitos fueron tomados de la Decisión del Párrafo 6 (ver arriba). Documento de trabajo ...,3,0.582652,0.409946,0.007401,5,False
142,Sport for Health tiene como objetivo reducir la carga de enfermedades no transmisibles en Tonga....,3,0.691101,0.305395,0.003503,5,False
148,Este documento concluye con métodos para acelerar el despliegue de normas en la sanidad pública ...,3,0.567246,0.422907,0.009847,5,False
...,...,...,...,...,...,...,...
4016,El análisis de género es un requisito del proceso obligatorio de aseguramiento de la calidad del...,5,0.268264,0.012980,0.718756,4,False
4027,Las consideraciones importantes al desarrollar un cuestionario sobre la violencia contra la muje...,5,0.434342,0.005472,0.560187,4,False
4033,Los miembros de la red comparten mejores prácticas y brindan servicios financieros a mujeres emp...,5,0.407347,0.582085,0.010567,3,False
4038,Esta nueva ley descalificó al 95 por ciento de las 630 mujeres capacitadas por ZNWL para el lide...,5,0.427041,0.008951,0.564008,4,False
