# Thesis Project

- Author 💻✍: Diego Acosta Ugalde
- Description 🖇: This notebook contains all the implementations of my thesis project. The structure is shown in the Table of Contents.
- Date of creation 📅: 8/Feb/2024

----

MODULES

    1 -> Preprocessing

    2 -> Initial Data Exploration
    
    3 -> Text Representations

    4 -> Topic Recognition

    5 -> ML Classifiers

    6 -> DL Classifiers

    7 -> Transformers

## 1: Preprocessing of the raw ECOA data base

In [None]:
# Dependencies
# from google.colab import drive
# from google.colab import files
import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models import CoherenceModel
import graphviz

import matplotlib.pyplot as plt

import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize.treebank import TreebankWordDetokenizer

import pandas as pd
import pydotplus
import plotly.graph_objects as go


import stanza
from stanza.models.common.doc import Document
from scipy import stats
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.manifold import TSNE
from sklearn import tree
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics.pairwise import cosine_similarity

from transformers import AutoTokenizer, AutoModelForMaskedLM

from wordcloud import WordCloud

In [None]:
# Survey aspect columns; kept as a separate list because later cells iterate over it
aspects = ['MET', 'PRA', 'ASE', 'EVA', 'RET', 'APR', 'DOM', 'REC', 'MEJ']

# Columns that we want to use: every aspect plus the gender, cumulative-average and comment columns
columns = aspects + [
    'GÉNERO ALUMNO',
    'PROM ACUMULADO EN PROFESIONAL',
    'Género del profesor',
    'Por qué no lo recomendarias',
    'Comentarios',
]

In [None]:
# drive.mount('/content/drive')
# df=pd.read_excel('drive/MyDrive/Tesis/ecoa-demografico.xlsx')
df = pd.read_excel('ecoa-demografico.xlsx')

In [None]:
df.head()

In [None]:
print(f'Our original dataset is of shape: {df.shape}')

In [None]:
df_ecoas = df[columns]
df_ecoas.head()

In [None]:
pd.options.mode.chained_assignment = None  # Disable the chained-assignment warning for the column rewrites below

# Keep a row only if at least one of the two comment columns is filled in (thresh=1 over the subset)
df_ecoas = df_ecoas.dropna(subset=['Por qué no lo recomendarias', 'Comentarios'], thresh=1)

# Encode both gender columns: 'Femenino' -> 0, 'Masculino' -> 1; any other value is left as it is
for gender_column in ('GÉNERO ALUMNO', 'Género del profesor'):
    for label, code in (('Femenino', 0), ('Masculino', 1)):
        df_ecoas[gender_column] = np.where(df_ecoas[gender_column] == label, code, df_ecoas[gender_column])

In [None]:
# We delete all incomplete submissions: a row is dropped when any aspect column is
# missing or already holds the value 99 (the same rows the sentinel-based loop removed)
aspect_scores = df_ecoas[aspects]
incomplete = aspect_scores.isnull().any(axis=1) | aspect_scores.eq(99).any(axis=1)
df_ecoas = df_ecoas.loc[~incomplete].reset_index(drop=True)  # Reset the index

In [None]:
# Rename the two comment columns after the polarity of the text they hold
df_ecoas = df_ecoas.rename(columns={"Comentarios": "Positivos", "Por qué no lo recomendarias": "Negativos"})

has_positive = df_ecoas['Positivos'].notna()
has_negative = df_ecoas['Negativos'].notna()

# "SOLO" means exclusively: rows that carry both kinds of comment must not be counted in either
# total (a plain .count() would include them in both)
p_only = int((has_positive & ~has_negative).sum())
n_only = int((has_negative & ~has_positive).sum())
print(f'Tenemos { p_only } comentarios SOLO positivos')
print(f'Tenemos { n_only } comentarios SOLO negativos')

In [None]:
# Comment type: 2 = both comments present, 1 = positive comment present (no negative), 0 = otherwise
has_positive = pd.notna(df_ecoas['Positivos'])
has_negative = pd.notna(df_ecoas['Negativos'])
# np.select takes the first matching condition, so "both" has to be listed before "positive"
df_ecoas['Tipo Comentario'] = np.select([has_negative & has_positive, has_positive], [2, 1], default=0)


In [None]:
# Merge both comment columns into a single text column; a missing side becomes an empty string,
# so every merged comment keeps the single separating space
positive_text = df_ecoas['Positivos'].fillna('')
negative_text = df_ecoas['Negativos'].fillna('')
df_ecoas = (
    df_ecoas
    .assign(Comentarios=positive_text + ' ' + negative_text)
    .drop(columns=['Negativos', 'Positivos'])
)

In [None]:
# Row-wise mean of six aspect scores, rounded to one decimal.
# NOTE(review): ASE, DOM and MEJ are not part of this average — confirm that this is intended.
avg_columns = ['APR', 'EVA', 'MET', 'PRA', 'REC', 'RET']
df_ecoas['AVG'] = df_ecoas[avg_columns].mean(axis=1).round(1)

In [None]:
# This is the data set after initial cleaning
df_ecoas.head()

### Lemmatization
*In this step we perform some NLP preprocessing techniques, such as stop-word removal, removal of symbols and special characters, and lemmatization of the entire comments column.*

In [None]:
comments = df_ecoas['Comentarios']

# Report any entry that is not a string. Nothing is dropped here: removing rows from df_ecoas
# at this point would leave it out of step with `comments`, and every entry is coerced with
# str() during tokenization anyway.
for position, value in enumerate(comments):
  if type(value) is not str:
    print(type(value), 'No es str: ', position)

comments = comments.str.lower()

# Spanish stop words; 'no' is kept as a token and 'es' is added to the list
nltk.download('stopwords')
stop = stopwords.words('spanish')
stop.remove('no')
stop.append('es')
stop_lookup = set(stop)  # constant-time membership test; `stop` itself stays a list

# Symbols deleted from every token (the backslash is escaped explicitly)
sim = "!\"#$%&()*+-.,/:;<=>?@[\\]^_`{|}~\n"
strip_symbols = str.maketrans('', '', sim)

# Tokenize on whitespace, drop stop words, then delete the symbols inside each remaining token.
# A token made only of symbols becomes '' and is kept at this stage.
comments = comments.apply(
  lambda text: [word.translate(strip_symbols) for word in str(text).split() if word not in stop_lookup]
)

comments

In [None]:
# Comments left with fewer than two tokens, and how many of those contain 'si' / 'no'
short_comments = [tokens for tokens in comments if len(tokens) < 2]

total_less2 = len(short_comments)
total_si = sum(1 for tokens in short_comments if 'si' in tokens)
total_no = sum(1 for tokens in short_comments if 'no' in tokens)

print(f'Tenemos {total_less2} comentarios con una sola palabra, sin embargo, no los eliminaremos porque muchos son palabras que pueden ayudar al modelo')
print(f'Tenemos {total_si} comentarios con la palabra "si"')
print(f'Tenemos {total_no} comentarios con la palabra "no"')

In [None]:
# Turn the comments into a plain list of token lists without any empty-string tokens
comments = [[token for token in tokens if token != ''] for tokens in comments]

# Join every token list back into a single string
detokenizer = TreebankWordDetokenizer()
comentarios_merge = [detokenizer.detokenize(tokens) for tokens in comments]

print('Example of comment:')
comentarios_merge[2]

In [None]:
comments_lemm = []

# Stanza pipeline settings: Spanish, with the tokenize, mwt, pos and lemma processors
config = dict(processors='tokenize,mwt,pos,lemma', lang='es')
nlp = stanza.Pipeline(**config)  # Build the pipeline from the settings dict

In [None]:
import concurrent.futures

com_lemmatized = []
comments_lemm = []

def lemmatize_comments(comment):
  """Return the lemmas of ``comment`` as one flat list.

  The text is run through the module-level ``nlp`` pipeline and the ``lemma``
  of every word is collected, sentence by sentence, in document order.
  """
  return [word.lemma for sentence in nlp(comment).sentences for word in sentence.words]

# Lemmatize every merged comment on a thread pool; map() yields the results in input order
with concurrent.futures.ThreadPoolExecutor() as pool:
  results = pool.map(lemmatize_comments, comentarios_merge)
  comments_lemm = list(results)


In [None]:
print('Example of comment after lemmatizaion:')
comments_lemm[2]

In [None]:
df_ecoas['Lemm'] = comments_lemm
df_ecoas.tail()

In [None]:
df_ecoas.shape

In [None]:
df_ecoas.tail()

In [None]:
df_ecoas.to_csv('df_ready.csv')

## 2: Initial Data Exploration

In this section we explore the dataset to reveal some of its characteristics.

First, a correlation analysis between the quantitative data is performed to reveal the relationship between some educational aspects and other variables like gender and overall mean (promedio acumulado).

Second, we analyze the distribution of the scores given to the educational aspects.

In third place, we move to the text to perform the initial exploration of the comments using Term Frequency Inverse Document Frequency (TF-IDF) and use that text representation to create a Word Cloud.

Lastly, we perform a Latent Dirichlet Allocation (LDA) analysis to see whether the LDA algorithm is capable of identifying 8 categories/topics (in this case, educational aspects), and we examine the coherence metrics that result from that analysis.

### Correlation Analysis

Using Pearson and point-biserial correlation coefficients.

In [None]:
df_ecoas=pd.read_csv('df_ready.csv', index_col=0)
df_ecoas.head()

In [None]:
comments = df_ecoas['Comentarios']

# Flag entries that are not strings. No row is dropped (a bare drop() call would discard its
# result anyway); non-string values are coerced with str() during tokenization below.
non_string = comments.map(lambda value: type(value) is not str)
for position in np.flatnonzero(non_string.to_numpy()):
  print(type(comments.iloc[position]), 'No es str: ', position)

comments = comments.str.lower()

# Spanish stop words, keeping 'no' as a token and treating 'es' as a stop word
nltk.download('stopwords')
stop = stopwords.words('spanish')
stop.remove('no')
stop.append('es')
stop_words = frozenset(stop)  # fast lookups; the `stop` list is left available for later cells

# Symbols removed from every token (backslash written as an explicit escape)
sim = "!\"#$%&()*+-.,/:;<=>?@[\\]^_`{|}~\n"
symbol_table = str.maketrans('', '', sim)


def _tokenize(text):
  """Split on whitespace, drop stop words, then delete the symbols inside each kept token."""
  return [word.translate(symbol_table) for word in str(text).split() if word not in stop_words]


# One token list per comment; tokens made only of symbols end up as ''
comments = comments.apply(_tokenize)

comments

In [None]:
df_ecoas.corr(numeric_only=True)

In [None]:
df_ecoas['MEJ'].value_counts() # Now lets focus on binary variables. MEJ

In [None]:
# Correlation of each score column with the binary MEJ column.
# NOTE(review): pandas documents a callable `method` as returning a float, while
# stats.pointbiserialr returns a (statistic, p-value) result — confirm the output layout.
mej_score_columns = ['APR', 'DOM', 'EVA', 'MET', 'PRA', 'REC', 'RET', 'ASE', 'AVG']
df_ecoas[mej_score_columns].corrwith(df_ecoas['MEJ'], method=stats.pointbiserialr)

In [None]:
df_ecoas['GÉNERO ALUMNO'].value_counts()

In [None]:
# Correlation of each score column with the student-gender column.
# NOTE(review): stats.pointbiserialr returns a (statistic, p-value) result rather than the
# single float pandas documents for a callable `method` — confirm the output layout.
student_gender_score_columns = ['APR', 'DOM', 'EVA', 'MEJ', 'MET', 'PRA', 'REC', 'RET', 'ASE', 'AVG']
df_ecoas[student_gender_score_columns].corrwith(df_ecoas['GÉNERO ALUMNO'], method=stats.pointbiserialr)

In [None]:
df_ecoas['Género del profesor'].value_counts()

In [None]:
# Correlation of each score column with the professor-gender column.
# No `method` is passed here, so pandas' default correlation method is used.
# NOTE(review): the other two gender/MEJ cells pass stats.pointbiserialr — confirm the difference is intended.
professor_gender_score_columns = ['APR', 'DOM', 'EVA', 'MEJ', 'MET', 'PRA', 'REC', 'RET', 'ASE', 'AVG']
df_ecoas[professor_gender_score_columns].corrwith(df_ecoas['Género del profesor'])

### Distribution Analysis

*We are using the Kernel Density Estimation, which is a non-parametric way to estimate the probability density function (PDF) of a random variable. This method is used to get a smooth curve (function) that is a good guess of the distribution of a set of data*

*As we can observe, the distribution of the average scores is highly imbalanced.*

In [None]:
df_ecoas['AVG'].plot(kind='kde')

In [None]:
df_ecoas['REC'].plot(kind='kde')

*For the question "Would you consider this professor one of the best you have ever had?", measured by the aspect MEJ, we can observe that the imbalance also exists, but not with the same distribution as the AVG score.*

*This motivated us to apply a threshold to the AVG score and observe whether the imbalance would then follow a distribution similar to the one below.*

*Remember that the MEJ aspect has:*
- *Considered one of the best -> 63,162*
- *Not considered one of the best -> 33,378*

In [None]:
# Hard-coded MEJ counts quoted in the text above (not recomputed from df_ecoas)
yes, no = 63162, 33378
total = sum((yes, no))
print(f'The percentage of YES is: {round(yes/total*100,2)}%')
print(f'The percentage of NO is: {round(no/total*100,2)}%')

In [None]:
# Kernel density estimate of the MEJ column, saved to disk and then displayed
ax = df_ecoas['MEJ'].plot(kind='kde')
ax.set_title("MEJ aspect")
ax.set_xlabel("Type")
ax.set_ylabel("Estimated Density")
ax.figure.savefig('MEJ_kde.png', dpi=300)
plt.show()

In [None]:
# Kernel density estimate of the student-gender column, saved to disk and then displayed
ax = df_ecoas['GÉNERO ALUMNO'].plot(kind='kde')
ax.set_title("Student Gender")
ax.set_xlabel("Type")
ax.set_ylabel("Estimated Density")
ax.figure.savefig('student_gender_kde.png', dpi=300)
plt.show()

In [None]:
# Kernel density estimate of the professor-gender column, saved to disk and then displayed
ax = df_ecoas['Género del profesor'].plot(kind='kde')
ax.set_title("Professor Gender")
ax.set_xlabel("Type")
ax.set_ylabel("Estimated Density")
ax.figure.savefig('professor_gender_kde.png', dpi=300)
plt.show()

*There is another imbalance in the "kind of comment" label, the majority of students wrote positive-only comments (1), followed by positive-and-negative comments (2) and only a small portion of the students wrote negative-only (0) comments.*

In [None]:
# Kernel density estimate of the comment-type label, saved to disk and then displayed
ax = df_ecoas['Tipo Comentario'].plot(kind='kde')
ax.set_title("Type of comment")
ax.set_xlabel("Type")
ax.set_ylabel("Estimated Density")
ax.figure.savefig('typecomm.png', dpi=300)
plt.show()

In [None]:
df_ecoas.head()

#### Using a threshold to change the data distribution
*We will use a scale to define a threshold and change the values in the aspects from an 11-point scale to binary variables*

In [None]:
scales = [[0, 9, 0], [9.1, 10, 1]]
scales

In [None]:
# AVG is thresholded together with the aspect scores; MEJ is taken out because
# we don't want to use the scale on that column.
# The membership checks make the cell safe to re-run: an unconditional append would add
# 'AVG' again and remove('MEJ') would raise ValueError on the second execution.
if 'AVG' not in aspects:
    aspects.append('AVG')
if 'MEJ' in aspects:
    aspects.remove('MEJ')
aspects

In [None]:
df_before_scale = df_ecoas.copy() # We save a dataframe copy just in case

In [None]:
# Map every score to the label of the interval [low, high] that contains it.
# The intervals are evaluated against the untouched copy in df_before_scale, so that
#   * an already-assigned label can never be re-matched by a later interval, and
#   * re-running this cell gives the same result (applied to the already binarized
#     df_ecoas, the label 1 would fall inside [0, 9] and collapse to 0).
# Values that match no interval (e.g. NaN) are left unchanged.
scale_labels = [label for _, _, label in scales]
for asp in aspects:
    original_scores = df_before_scale[asp]
    in_interval = [(original_scores >= low) & (original_scores <= high) for low, high, _ in scales]
    df_ecoas[asp] = np.select(in_interval, scale_labels, default=original_scores)
df_ecoas = df_ecoas.reset_index(drop=True)
df_ecoas.head()

*Now all the columns are more or less in the same proportions*

In [None]:
# Relative frequency of each value per column; MEJ is not in aspects, so it is appended here
for column in [*aspects, 'MEJ']:
  print(df_ecoas[column].value_counts(normalize = True))
  print('- - - - - - - - - - - - ')

*This is how the distribution of the data set looked before.*

In [None]:
# KDE curves of the columns in `aspects`, taken from the copy saved before thresholding
ax = df_before_scale[aspects].plot(kind='kde')
ax.set(title='Kernel Density Estimation Plot', xlabel="Score", ylabel="Estimated Density")
ax.legend(aspects, title='Aspects')
ax.figure.savefig('initial_distribution_kde.png', dpi=300)
plt.show()

Using a threshold helps us reduce the variability since, as shown before, highly correlated values are not beneficial for the model. The distribution shape will change completely: it will now be a binomial-like distribution in which the data points are categorized into one of two groups, 0 or 1.

In [None]:
# Put MEJ back so it is plotted with the thresholded columns; the membership check
# keeps the list free of duplicates when the cell is executed more than once
if 'MEJ' not in aspects:
    aspects.append('MEJ')
aspects

In [None]:
# KDE curves of the same columns in df_ecoas, i.e. after the threshold cell above
ax = df_ecoas[aspects].plot(kind='kde')
ax.set(title='Kernel Density Estimation Plot', xlabel="Score", ylabel="Estimated Density")
ax.legend(aspects, title='Aspects')
ax.figure.savefig('scaled_distribution_kde.png', dpi=300)
plt.show()

The loss of information is evidenced in the new correlation matrix after the threshold is applied. As we can see, the data is now less correlated.

In [None]:
df_ecoas[aspects].corr(numeric_only=True)

## 3: Text Representations
We will generate different text representations of the comments: TF-IDF, Word Embeddings and Transformers Encoding

### 3.1:TF-IDF
Term Frequency Inverse Document Frequency

In [None]:
df_ecoas.head()

In [None]:
from ast import literal_eval

# df_ecoas was reloaded from df_ready.csv, so each 'Lemm' cell is the text form of a token
# list; parse it back into a real list. Cells that are no longer strings are passed through
# untouched, so running this cell a second time does not make literal_eval raise.
df_ecoas['Lemm'] = df_ecoas['Lemm'].apply(
    lambda cell: literal_eval(cell) if isinstance(cell, str) else cell
)

In [None]:
commentsL = df_ecoas['Lemm']

In [None]:
# Merge comments: join each lemma list back into a single string
detokenizer = TreebankWordDetokenizer()
comentarios_merge = [detokenizer.detokenize(list(lemmas)) for lemmas in commentsL]

In [None]:
comentarios_merge[2]

In [None]:
# Everything except these columns is kept as the label matrix
non_label_columns = [
    'Comentarios',
    'Lemm',
    'Género del profesor',
    'GÉNERO ALUMNO',
    'PROM ACUMULADO EN PROFESIONAL',
    'Tipo Comentario',
    'AVG',
    'MEJ',
]
aspects_classes = df_ecoas.drop(columns=non_label_columns)
aspects_classes.head()

In [None]:
comments_train,comments_test,aspects_train, aspects_test = train_test_split(comentarios_merge, aspects_classes, test_size=0.25, random_state=30)
print('Train:', 'Comentarios->', len(comments_train), 'Aspectos->',  aspects_train.shape, '\nTest:', 'Comentarios->', len(comments_test),'Aspectos->', aspects_test.shape)

In [None]:
comments_train[:5]

In [None]:
# Fit the TF-IDF vocabulary and weights on the training comments only, then reuse them for the test comments
vectorizer = TfidfVectorizer()
tf_x_train = vectorizer.fit_transform(comments_train)
tf_x_test = vectorizer.transform(comments_test)

# Dense documents-by-terms frame (this materialises the whole matrix in memory)
tf_idf = pd.DataFrame(tf_x_train.todense(), columns=vectorizer.get_feature_names_out())

# Terms as rows, plus a 'Count' column holding each term's summed TF-IDF weight over all training comments
tf_idf_matrix = tf_idf.T
tf_idf_matrix['Count'] = tf_idf_matrix.sum(axis=1)

# Top 15 words with the highest count
tf_idf_matrix = tf_idf_matrix.sort_values(by='Count', ascending=False).iloc[:15]

print(tf_idf_matrix.drop(columns='Count'))

In [None]:
# Wordcloud
# Combine all the text data into a single string
text_data = ' '.join(comments_train)

# Create a WordCloud object
wordcloud = WordCloud(width=1600, height=800, background_color='white').generate(text_data)

# collocations = False if you don't want to consider bigrams

# Display the word cloud
plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.savefig('WordCloud_TFIDF.png', dpi=300)
plt.show()

### 3.2:Word-Embeddings

In [None]:
from gensim.models import Word2Vec
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# Train Word2Vec on `comments` (the tokenized comments built earlier in this section)
w2v_params = dict(vector_size=100, window=5, min_count=1, workers=4)
model = Word2Vec(sentences=comments, **w2v_params)

# The trained word vectors live on the model's `wv` attribute
word_vectors = model.wv

# Vector of a single word, and the words closest to it
vector = word_vectors['maestro']
similar_words = word_vectors.most_similar('maestro')
print(f'This are the most similar words to `maestro`: {similar_words}')

In [None]:
# Other similar words
similar_words = word_vectors.most_similar('dominio')
print(f'This are the most similar words to `dominio`: {similar_words}')

similar_words = word_vectors.most_similar('actividad')
print(f'This are the most similar words to `actividad`: {similar_words}')

In [None]:
# Function to flatten the list of lists
def flatten_list(l):
    """Flatten one level of nesting: return the items of every sub-iterable of ``l`` in order."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [None]:
# Aspects keywords # this one give me max similarity of 0.82 and min from those max of 0.15
MET_keywords = 'innovación tecnología herramientas explicaciones claridad precisión medios técnicas apoyo facilitación estilos ritmos aprendizaje recursos digitales aplicaciones interactivas plataformas educativas en línea dinámico participativo estrategias adaptación educación efectiva avanzadas enriquecimiento experiencia metodología actividades instructor estudiantes recursos digitales innovadoras tecnológicas estratégicas adaptativas participación'
PRA_keywords = 'casos proyectos problemas prácticas laboratorios talleres empresas organizaciones interacción profesionales teoría desafíos mundo real habilidades prácticas críticas resolución aplicación consolidación enfoque conocimiento preparación fomento desarrollo campo retos experiencia aprendizaje activo inmersión profundización comprensión contextualización aplicada realidad dinamismo pragmatismo soluciones'
ASE_keywords = 'interacción asesoría proceso éxito disponibilidad medios horarios ambiente respeto abierto apoyo dudas confianza profundo significativo potencial desarrollo profesor estudiantes entorno confianza motivación comprensión orientación consultas retroalimentación crecimiento diálogo comunicación empatía resolución preguntas accesibilidad dedicación involucramiento participación escucha asesoramiento'
EVA_keywords = 'evaluación retroalimentación fortalezas debilidades herramientas justa equitativa políticas criterios continua pruebas trabajos presentaciones orales progreso desarrollo mejora sistema integral métodos entendimiento contribución estudiante fomento calificación desempeño objetivos logros aprendizaje feedback autoevaluación peer-review competencias resultados análisis reflexión crecimiento mejora continua'
RET_keywords = 'reto intelectual estímulo interés motivación esfuerzo calidad aprendizaje crecimiento desafío tareas complejas significativas pensamiento crítico creatividad innovación conocimientos habilidades futuro profesional personal desarrollo solución problemas análisis reflexión autonomía perseverancia compromiso curiosidad exploración adquisición adaptabilidad resiliencia liderazgo colaboración ética'
APR_keywords = 'guía inspiración compromiso desarrollo crecimiento integral educador confianza pasión conocimiento entorno empoderamiento exploración cuestionamiento contribución proceso educativo centrado estudiante autonomía responsabilidad preparación desafíos futuros liderazgo ética colaboración innovación creatividad pensamiento crítico motivación perseverancia adaptabilidad resiliencia curiosidad explorador activo participativo'
DOM_keywords = 'dominio materia transmisión contenido educativo conocimientos experiencia campo claro interesante relevante conexiones conceptos curiosidad interés adaptación necesidades individuales enriquecimiento profundización experto profesional capacidad inspirador engagement contextualización aplicabilidad dinámica interactividad perspicacia visión comprensión analítica crítica metodológica didáctica pedagógica innovadora'

keyword_strings = [MET_keywords, PRA_keywords, ASE_keywords, EVA_keywords, RET_keywords, APR_keywords, DOM_keywords]

# Tokenize the aspect keywords on whitespace and drop the stop words
aspect_keywords = [
  [word for word in str(keywords).split() if word not in stop]
  for keywords in keyword_strings
]

print('Aspect keywords:', aspect_keywords)

In [None]:
# Aspects general descriptions # this one give me max similarity of nan and min from those max of nan
MET = 'La metodología y actividades de aprendizaje son cruciales para una educación efectiva. Este aspecto abarca cómo el instructor brinda explicaciones claras y precisas, utilizando medios y técnicas innovadoras, junto con herramientas tecnológicas avanzadas. Estas estrategias facilitan y apoyan el aprendizaje del estudiante, adaptándose a diferentes estilos y ritmos de aprendizaje. La incorporación de recursos digitales, aplicaciones interactivas y plataformas educativas en línea enriquece la experiencia educativa, permitiendo un aprendizaje más dinámico y participativo'
PRA = 'La comprensión de conceptos a través de su aplicación práctica es fundamental para consolidar el aprendizaje. Este enfoque incluye la resolución de casos, proyectos o problemas reales, así como la realización de prácticas en laboratorios, talleres, visitas a empresas u organizaciones y la interacción con profesionales del campo. Estas actividades no solo enriquecen el conocimiento teórico sino que también preparan a los estudiantes para los desafíos del mundo real, fomentando el desarrollo de habilidades prácticas, críticas y de resolución de problemas'
ASE = 'La interacción efectiva con el profesor y la asesoría recibida son esenciales para un proceso de aprendizaje exitoso. Este aspecto destaca la disponibilidad del profesor en medios y horarios previamente acordados, proporcionando un ambiente de aprendizaje respetuoso y abierto. La capacidad del profesor para apoyar en la resolución de dudas y fomentar un entorno de confianza promueve un aprendizaje más profundo y significativo, permitiendo a los estudiantes desarrollar su potencial al máximo'
EVA = 'El sistema de evaluación juega un papel crucial en el proceso educativo, proporcionando retroalimentación valiosa sobre las fortalezas y debilidades del estudiante. Utilizando un conjunto diverso de herramientas, este sistema permite evaluar de manera justa y equitativa, basándose en políticas y criterios claramente establecidos. La evaluación continua, las pruebas, los trabajos de curso y las presentaciones orales son ejemplos de métodos que contribuyen a un entendimiento integral del progreso del estudiante, fomentando su desarrollo y mejora continua.'
RET = 'El nivel de reto intelectual es vital para estimular el interés y la motivación de los estudiantes, exigiéndoles dar su mayor esfuerzo y cumplir con calidad en beneficio de su aprendizaje y crecimiento personal. Este aspecto implica desafiar a los estudiantes con tareas complejas y significativas, promoviendo el pensamiento crítico, la creatividad y la innovación. Al enfrentarse a retos intelectuales, los estudiantes no solo adquieren conocimientos sino que también desarrollan habilidades esenciales para su futuro profesional y personal'
APR = 'El papel del educador como guía del aprendizaje es inspirar y demostrar compromiso con el aprendizaje, desarrollo y crecimiento integral de los estudiantes. A través de su ejemplo y dedicación, el profesor inspira confianza y pasión por el conocimiento, facilitando un entorno de aprendizaje en el que los estudiantes se sienten empoderados para explorar, cuestionar y contribuir activamente a su propio proceso educativo. Este enfoque centrado en el estudiante promueve la autonomía y la responsabilidad en el aprendizaje, preparándolos para enfrentar desafíos futuros con confianza'
DOM = 'El dominio de la materia por parte del profesor es un componente esencial para transmitir efectivamente el contenido educativo. Este aspecto refleja los profundos conocimientos y experiencia del educador en su campo, permitiéndole presentar los temas de manera clara, interesante y relevante. Un sólido dominio facilita la generación de conexiones entre conceptos, estimulando la curiosidad y el interés de los estudiantes. Además, habilita al profesor para adaptar su enseñanza a las necesidades individuales de los estudiantes, enriqueciendo su experiencia de aprendizaje.'

aspect_descriptions = [MET, PRA, ASE, EVA, RET, APR, DOM]

# Tokenize the aspect descriptions with the same normalization applied to the comments the
# Word2Vec model is trained on: lowercase, delete the symbols listed in `sim`, then drop stop
# words and empty tokens. Without it, tokens such as "efectiva." or "La" can never match a
# vocabulary entry and are silently ignored when the description is embedded.
symbol_table = str.maketrans('', '', sim)
for i in range(len(aspect_descriptions)):
  words = [word.translate(symbol_table) for word in str(aspect_descriptions[i]).lower().split()]
  aspect_descriptions[i] = [word for word in words if word and word not in stop]

print('Aspect general description:', aspect_descriptions)

In [None]:
# Aspects descriptions for the students point of view
MET_student = 'Como estudiante, valoro cuando los profesores utilizan metodologías y actividades de aprendizaje que me brindan explicaciones claras y precisas. Aprecio especialmente el uso de medios y técnicas innovadoras, como herramientas tecnológicas, que hacen el aprendizaje más accesible y atractivo. Estas estrategias no solo facilitan mi comprensión de los temas, sino que también estimulan mi curiosidad y deseo de explorar más allá de lo básico, apoyando eficazmente mi proceso educativo'
PRA_student = 'La posibilidad de aplicar en la práctica lo que aprendo en teoría es fundamental para mí. A través de la resolución de casos reales, proyectos, y prácticas en laboratorios o talleres, siento que mi educación cobra sentido. Visitar empresas y organizaciones o interactuar con profesionales que aplican los temas discutidos en clase me ayuda a comprender mejor cómo se utilizan estos conceptos en el mundo real, preparándome para mi futuro profesional'
ASE_student = 'La interacción con mi profesor y la asesoría que recibo son clave para mi aprendizaje. Cuando un profesor está disponible para resolver mis dudas, ya sea en horarios convenidos o a través de medios digitales, me siento más apoyado y confiado en mi proceso educativo. Un ambiente de aprendizaje respetuoso y abierto, donde puedo expresar mis opiniones y preguntas sin temor, es esencial para mi desarrollo académico y personal'
EVA_student = 'El sistema de evaluación me ayuda a entender mis fortalezas y debilidades dentro del curso. Aprecio cuando se utilizan diversas herramientas de evaluación que ofrecen retroalimentación constructiva, basadas en criterios claros y justos. Esta retroalimentación me permite identificar áreas de mejora y reconocer mis logros, guiándome hacia un aprendizaje más efectivo y personalizado'
RET_student = 'Me siento genuinamente motivado cuando los desafíos intelectuales me exigen dar lo mejor de mí. Los retos que me llevan a cuestionar lo que sé y a esforzarme por entender y aplicar conceptos complejos enriquecen mi experiencia educativa. Este nivel de exigencia no solo impulsa mi crecimiento académico, sino que también fomenta mi desarrollo personal, preparándome para enfrentar futuros desafíos con confianza.'
APR_student = 'Cuando un profesor actúa como guía en mi aprendizaje, inspirándome y demostrando un compromiso genuino con mi desarrollo, siento que mi educación se personaliza y profundiza. Esta actitud no solo me motiva a aprender más, sino que también me enseña la importancia del compromiso y la pasión por el conocimiento. Un educador que guía, desafía e inspira es fundamental para mi crecimiento integral como estudiante'
DOM_student = 'El dominio de la materia por parte de mi profesor es imprescindible. Cuando un educador tiene un profundo conocimiento y experiencia en el tema que enseña, las clases son más claras, interesantes y relevantes. Este dominio permite al profesor conectar los conceptos con la vida real, despertando mi curiosidad y fomentando un aprendizaje más significativo. Además, un profesor que realmente entiende su materia puede adaptar la enseñanza a las necesidades de sus estudiantes, enriqueciendo nuestra experiencia educativa.'

aspects_studentpov = [MET_student, PRA_student, ASE_student, EVA_student, RET_student, APR_student, DOM_student]

# Tokenize the student-POV descriptions like the comments used to train the Word2Vec model:
# lowercase, delete the symbols listed in `sim`, then drop stop words and empty tokens.
# Capitalized words and words with attached punctuation ("Como", "precisas.") would otherwise
# never be found in the model vocabulary.
symbol_table = str.maketrans('', '', sim)
for i in range(len(aspects_studentpov)):
  words = [word.translate(symbol_table) for word in str(aspects_studentpov[i]).lower().split()]
  aspects_studentpov[i] = [word for word in words if word and word not in stop]

print('Aspect descriptions as student POV:', aspects_studentpov)

In [None]:
# Aspects exact questions in the SET
ecoa_questions = [
# MET
'metodología y actividades de aprendizaje (me brindó explicaciones claras y precisas, medios y técnicas innovadoras o herramientas tecnológicas que facilitaron y apoyaron mi aprendizaje)',
# PRA
'comprensión de conceptos en términos de su aplicación en la práctica (resolví casos, proyectos o problemas reales, realicé prácticas en laboratorios o talleres, visitas a empresas u organizaciones, o interactué con personas que trabajan aplicando los temas de la clase)',
# ASE
'interacción con el profesor y la asesoría recibida durante el proceso de aprendizaje (me apoyó para resolver dudas, el profesor estuvo disponible en medios y horarios previamente acordados, hubo un ambiente de aprendizaje respetuoso y abierto)',
# EVA
'sistema de evaluación (se utilizó un conjunto de herramientas que me dieron retroalimentación sobre mis fortalezas y debilidades en el curso con base en políticas y criterios establecidos oportunamente)',
# RET
'nivel de reto intelectual (me motivó y me exigió dar mi mayor esfuerzo y cumplir con calidad en beneficio de mi aprendizaje y mi crecimiento personal)',
# APR
'papel como guía del aprendizaje (me inspiró y demostró compromiso con mi aprendizaje, desarrollo y crecimiento integral)',
# DOM
'dominio de la materia (sus conocimientos y experiencia)',
]

# Tokenize the survey questions like the comments used to train the Word2Vec model:
# lowercase, delete the symbols listed in `sim` (the questions contain parentheses and commas),
# then drop stop words and empty tokens. Tokens such as "(me" or "aprendizaje)" would otherwise
# never match a vocabulary entry.
symbol_table = str.maketrans('', '', sim)
for i in range(len(ecoa_questions)):
  words = [word.translate(symbol_table) for word in str(ecoa_questions[i]).lower().split()]
  ecoa_questions[i] = [word for word in words if word and word not in stop]


print('Aspect ecoa questions:', ecoa_questions)

In [None]:
# We can now create a vector using the word vectors from the Word2Vec model

def get_embedding(descriptions): # Function to obtain the description vector
    """Return one summed word vector per entry of ``descriptions``.

    Each entry is iterated item by item (so it is expected to be a sequence of
    tokens); the vectors of the items found in the module-level ``model.wv``
    are added up, and items that are not found are skipped. An entry with no
    known item yields an all-zero vector of length ``model.vector_size``.
    """
    keyed_vectors = model.wv
    embeddings = []
    for tokens in descriptions:
        total = np.zeros(model.vector_size)  # running sum for this description
        for token in tokens:
            if token not in keyed_vectors:
                continue  # out-of-vocabulary items contribute nothing
            total += keyed_vectors[token]
        embeddings.append(total)
    return embeddings


In [None]:
aspects = ['MET', 'PRA', 'ASE', 'EVA', 'RET', 'APR', 'DOM', 'REC', 'MEJ']

In [None]:
# Example usage for the aspect keywords
aspects_keywords_vectors = get_embedding(aspect_keywords)

# Obtain the most similar words to the comment
similar_words = model.wv.similar_by_vector(aspects_keywords_vectors[0])

print(f"The most similar words to the aspect {aspects[0]}: '{aspect_keywords[0]}' are:")
for word, similarity in similar_words:
    print(f"{word}: {similarity}")

In [None]:
# Example usage for the aspect descriptions
aspects_description_vectors = get_embedding(aspect_descriptions)

# Obtain the most similar words to the comment
similar_words = model.wv.similar_by_vector(aspects_description_vectors[1])

print(f"The most similar words to the aspect {aspects[1]}: '{aspect_descriptions[1]}' are:")
for word, similarity in similar_words:
    print(f"{word}: {similarity}")

In [None]:
# Example usage for the aspect descriptions from student POV
aspects_studentpov_vectors = get_embedding(aspects_studentpov)

# Obtain the most similar words to the comment
similar_words = model.wv.similar_by_vector(aspects_studentpov_vectors[2])

print(f"The most similar words to the aspect {aspects[2]}: '{aspects_studentpov[2]}' are:")
for word, similarity in similar_words:
    print(f"{word}: {similarity}")

In [None]:
# Example usage for the aspect descriptions from the exact question in the SET
aspects_questions_vectors = get_embedding(ecoa_questions)

# Obtain the most similar words to the comment
similar_words = model.wv.similar_by_vector(aspects_questions_vectors[4])

print(f"The most similar words to the aspect {aspects[4]}: '{ecoa_questions[4]}' are:")
for word, similarity in similar_words:
    print(f"{word}: {similarity}")

#### PCA: Principal Component Analysis

In [None]:
def two_d_graph(pca_embeddings, title, labels=None):
    """Scatter-plot 2-D points, one colour and one legend entry per point.

    Parameters
    ----------
    pca_embeddings : iterable of (x, y) pairs
        The 2-D coordinates to draw, one row per item.
    title : str
        Title shown above the plot.
    labels : sequence of str, optional
        Legend entry for each point. Defaults to the first seven entries of
        the notebook-level `aspects` list, which is what the plot always used.

    Notes
    -----
    Points, colours and labels are paired with zip, so at most seven points
    (the number of colours) are drawn and any extra rows are ignored.
    """
    colors = ("red", "green", "blue", "orange", "black", "purple", "yellow")
    if labels is None:
        labels = aspects[:7]

    # Explicit figure/axes instead of the pyplot state machine
    fig, ax = plt.subplots()

    for (x, y), color, group in zip(pca_embeddings, colors, labels):
        ax.scatter(x, y, alpha=0.8, c=color, edgecolors='none', s=30, label=group)

    ax.set_title(title)
    ax.legend(loc=1)
    plt.show()


# We can use PCA to reduce the number of dimensions of the vectors
pca = PCA(n_components=2)
# Perform PCA
pca_embeddings = pca.fit_transform(aspects_keywords_vectors)

two_d_graph(pca_embeddings, 'Reduced vocab set for aspects_keywords')

In [None]:
# Same 2-D PCA projection, this time for the aspect-description vectors
pca = PCA(n_components=2)
# Fit the projection and project the vectors in one step
pca_embeddings = pca.fit_transform(aspects_description_vectors)
two_d_graph(pca_embeddings, 'Reduced vocab set for aspects descriptions')

In [None]:
# Same 2-D PCA projection for the descriptions written from the student's point of view
pca = PCA(n_components=2)
# Fit the projection and project the vectors in one step
pca_embeddings = pca.fit_transform(aspects_studentpov_vectors)
two_d_graph(pca_embeddings, 'Reduced vocab set for aspects descriptions from student POV')

In [None]:
# Finally, the same 2-D PCA projection for the exact question wording used in the SET
pca = PCA(n_components=2)
# Fit the projection and project the vectors in one step
pca_embeddings = pca.fit_transform(aspects_questions_vectors)
two_d_graph(pca_embeddings, 'Reduced vocab set for exact questions in the SET')

#### t-SNE

In [None]:
from sklearn.manifold import TSNE

In [None]:
# We don't want to use all the aspects because REC and MEJ are not in the Word2Vec model, so we will use only the first 7 aspects
aspects[:7]

In [None]:
np.array(aspects_keywords_vectors).shape

In [None]:
# t-SNE expects one row per sample. The samples here are the aspect vectors, so the
# matrix must NOT be transposed: with .T the embedding dimensions become the samples
# and the plot ends up labelling dimensions 0-6 as if they were aspects.
keyword_matrix = np.array(aspects_keywords_vectors)

# scikit-learn requires perplexity < n_samples; the default (30) is too large for a handful of aspects
tsne = TSNE(n_components=2, random_state=0, perplexity=min(30, len(keyword_matrix) - 1))
X_tsne = tsne.fit_transform(keyword_matrix)

In [None]:
pca_embeddings

In [None]:
# two_d_graph is already defined in the PCA section; redefining an identical copy here
# only invites the two versions drifting apart, so the existing definition is reused.
two_d_graph(X_tsne, 'Reduced vocab set for aspects_keywords')

In [None]:
# One row per aspect description: do not transpose, otherwise t-SNE embeds the
# embedding dimensions instead of the aspects.
description_matrix = np.array(aspects_description_vectors)

# scikit-learn requires perplexity < n_samples
tsne = TSNE(n_components=2, random_state=0, perplexity=min(30, len(description_matrix) - 1))
X_tsne = tsne.fit_transform(description_matrix)
two_d_graph(X_tsne, 'Reduced vocab set for aspects_descriptions_vectors')

In [None]:
# One row per student-POV description: do not transpose, otherwise t-SNE embeds the
# embedding dimensions instead of the aspects.
studentpov_matrix = np.array(aspects_studentpov_vectors)

# scikit-learn requires perplexity < n_samples
tsne = TSNE(n_components=2, random_state=0, perplexity=min(30, len(studentpov_matrix) - 1))
X_tsne = tsne.fit_transform(studentpov_matrix)
two_d_graph(X_tsne, 'Reduced vocab set for aspects_studentpov_vectors')

In [None]:
# One row per SET question: do not transpose, otherwise t-SNE embeds the
# embedding dimensions instead of the aspects.
questions_matrix = np.array(aspects_questions_vectors)

# scikit-learn requires perplexity < n_samples
tsne = TSNE(n_components=2, random_state=0, perplexity=min(30, len(questions_matrix) - 1))
X_tsne = tsne.fit_transform(questions_matrix)
two_d_graph(X_tsne, 'Reduced vocab set for aspects_questions_vectors (exact questions in the SET)')

#### Saving the word embeddings model

In [None]:
# We can now create a comment vector using the word vectors from the Word2Vec model and obtain the most similar words

# Define a function to obtain the comment vector
def get_comment_vector(comment):
    """Return the sum of the Word2Vec vectors of the words in one comment.

    Parameters
    ----------
    comment : str or iterable of str
        Either an already tokenized comment (iterable of words) or a raw
        string, which is split on whitespace first.

    Returns
    -------
    numpy.ndarray
        Vector of length `model.vector_size`. Words missing from the model's
        vocabulary are skipped, so a comment with no known word yields zeros.
    """
    # Iterating a raw string would walk it character by character, so split it first
    comment_words = comment.split() if isinstance(comment, str) else comment
    comment_vector = np.zeros(model.vector_size)  # Accumulator for the word vectors

    for word in comment_words:
        if word in model.wv:
            comment_vector += model.wv[word]  # Add the word vector to the comment vector

    return comment_vector

# Build the vector of one sample comment (index 2)
comment_vector = get_comment_vector(comments[2])

# Vocabulary words closest to that comment vector
similar_words = model.wv.similar_by_vector(comment_vector)

print(f"The most similar words to the comment '{comments[2]}' are:")
for neighbour in similar_words:
    word, similarity = neighbour
    print(f"{word}: {similarity}")


In [None]:
# Save the model
model.save('word2vec.model')

In [None]:
# Load the model
model = Word2Vec.load('word2vec.model')

### 3.3: BERT

In [None]:
df_ecoas.head()

In [None]:
comentarios_merge[:5]

In [None]:
tokenizer = AutoTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased")

In [None]:
# These are the questions we will use to compare against the comments and obtain the most similar ones
questions = ['MET', 'PRA', 'ASE', 'EVA', 'RET', 'APR', 'DOM']


In [None]:
# Questions and comments are tokenized with exactly the same settings:
# PyTorch tensors, padded to the longest item of the batch, truncated at 128 tokens.
tokenizer_options = dict(
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=128,
)

encoded_questions = tokenizer(questions, **tokenizer_options)
encoded_responses = tokenizer(comentarios_merge, **tokenizer_options)

In [None]:
encoded_questions['input_ids'].shape, encoded_responses['input_ids'].shape

In [None]:
# Pull the token ids and attention mask of a single encoded comment (index 4)
encoded_responses_example = {
    field: encoded_responses[field][4]
    for field in ('input_ids', 'attention_mask')
}

print(encoded_responses_example)


## 4: Topic Recognition

To continue with this task, we will use the last text representation of the previous section: BERT.

We performed cosine similarity between the encoded questions and the encoded comments; this gives us an idea of which aspects the students are talking about in each comment.

In [None]:
# NOTE(review): `input_ids` are the vocabulary indices produced by the tokenizer, not
# BERT hidden states (no model forward pass happens here), so this measures the
# similarity of token-id sequences. Confirm this is the intended representation.
question_ids = encoded_questions['input_ids'].numpy()
response_ids = encoded_responses['input_ids'].numpy()

# Each batch was padded separately, so the two matrices can have different widths
max_sequence_length = max(question_ids.shape[1], response_ids.shape[1])

# Right-pad both matrices with zeros to the common width
padded_questions = np.pad(question_ids, ((0, 0), (0, max_sequence_length - question_ids.shape[1])), 'constant')
padded_responses = np.pad(response_ids, ((0, 0), (0, max_sequence_length - response_ids.shape[1])), 'constant')

# One vectorised call replaces the Python double loop over every question/comment pair;
# the result has shape (len(questions), len(comentarios_merge)), exactly as before.
similarity_scores = cosine_similarity(padded_questions, padded_responses)


In [None]:
# We have the similarity scores between comments and questions: 7 aspects for 98427 different comments
similarity_scores.shape

In [None]:
similarity_scores.T

In [None]:
# For every comment (row of similarity_scores.T), the index of the best-scoring question
most_likely_questions = np.argmax(similarity_scores.T, axis=1)
mlikely_question = pd.DataFrame(most_likely_questions) # Convert to dataframe

# Count how often each question wins. Reindexing over every question index keeps the
# table aligned by question number and reports 0 (instead of failing on a missing
# label) for a question that is never the most similar one.
questions_counts = (
    mlikely_question[0]
    .value_counts()
    .reindex(range(len(questions)), fill_value=0)
)

List_questions = [
    [questions[idx], aspects[idx], questions_counts[idx]]
    for idx in range(len(questions))
]

pd.set_option('colheader_justify', 'center') # Center the column's names

df_countQA = pd.DataFrame(List_questions, columns = ['Pregunta', 'Aspect Key', 'Count'])
print(df_countQA)

In [None]:
# Give the single column a meaningful name, then translate each question index into its aspect key
mlikely_question = mlikely_question.rename(columns={0: "Cosine_Sim_MostLikeQ"})
mlikely_question['MostLikeAsp'] = mlikely_question['Cosine_Sim_MostLikeQ'].map(lambda q_idx: aspects[q_idx])

In [None]:
aspects[:7]

In [None]:
similarity_scores.T[0]

In [None]:
# NOTE(review): assigning a Series aligns on index labels, not on position. mlikely_question
# was built from a plain array, so it carries a default 0..n-1 index; if df_ecoas has a
# different index (e.g. rows were dropped earlier without resetting it), these columns
# will be misaligned or NaN. Confirm against the preprocessing cells.
df_ecoas['mlq_numb'] = mlikely_question['Cosine_Sim_MostLikeQ'] # Add the most likely question number to the dataframe
df_ecoas['mlq_asp'] = mlikely_question['MostLikeAsp'] # Add the most likely question KEY to the dataframe
df_ecoas.head()

In [None]:
df_ecoas.tail()

Latent Dirichlet Allocation (LDA)

Latent Dirichlet Allocation (LDA) is a type of statistical model that is used to discover the abstract "topics" that occur in a collection of documents. It's widely used in natural language processing (NLP) for tasks like document classification, organizing large sets of documents, and helping in understanding the main themes (educational aspects in this case) in them.

In [None]:
commentsL[:5]

In [None]:
# Token <-> id mapping over all comments, then one bag-of-words per comment
dictionary = corpora.Dictionary(commentsL)
corpus = list(map(dictionary.doc2bow, commentsL))

In [None]:
# Build LDA model
num_topics = 8
# LDA training is stochastic; without a fixed random_state the topics change on every
# run. The seed matches the one used by the other LDA cells in this notebook.
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=10, random_state=100)

# Print topics and their top keywords
for idx, topic in lda_model.print_topics(-1):
    print(f'Topic: {idx} \nWords: {topic}\n')

# Get topics for each document
topics_for_documents = [lda_model[doc] for doc in corpus]

In [None]:
# Number of topics for this run
num_topics = 8

# Vocabulary over all comments, dropping tokens present in fewer than 10 documents
# or in more than 50% of them
dictionary = corpora.Dictionary(commentsL)
dictionary.filter_extremes(no_below=10, no_above=0.5)

# Bag-of-words per comment: list of (token_id, token_count) tuples
corpus = [dictionary.doc2bow(doc) for doc in commentsL]

# Training settings gathered in one place
lda_params = dict(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)
lda_model = LdaModel(**lda_params)

# Ten highest-weighted words of every topic
for topic_id in range(num_topics):
    top_words = lda_model.show_topic(topic_id, topn=10)
    print(f"Topic {topic_id + 1}:")
    print([term for term, _weight in top_words])

# c_v coherence of the trained model
coherence_model_lda = CoherenceModel(model=lda_model, texts=commentsL, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score:', coherence_lda)

# Model output for every document of the corpus
topics_for_documents = [lda_model[bow] for bow in corpus]

In [None]:
len(topics_for_documents)

In [None]:
commentsL[2]

In [None]:
topics_for_documents[0]

LDA hyper-parameters fine tuning:

Try several `filter_extremes` settings (`no_below` / `no_above`) and compare the coherence score they produce for different numbers of topics.

In [None]:
# Candidate no_below thresholds (minimum number of documents a token must appear in)
below1, below2, below3, below4 = 100, 1000, 2500, 0

# Candidate no_above thresholds (maximum fraction of documents a token may appear in)
above1, above2, above3, above4 = 0.7, 0.65, 0.6, 1

# Pair them up positionally: [no_below, no_above]
extreme_values = [
    list(pair)
    for pair in zip(
        (below1, below2, below3, below4),
        (above1, above2, above3, above4),
    )
]

# Candidate numbers of topics: min_topics..max_topics inclusive
min_topics, max_topics, step_size = 2, 10, 1
topic_range = range(min_topics, max_topics + 1, step_size)

In [None]:
dictionary = corpora.Dictionary(commentsL)
print(len(dictionary))

In [None]:
for extremes in extreme_values:
    print('- - - - - - - - - - - - - - - - - - - - - - - ')
    coherence_scores = []

    # The dictionary and the corpus depend only on the filter thresholds, not on the
    # number of topics, so they are built once per setting instead of once per topic count.
    dictionary = corpora.Dictionary(commentsL)

    # Drop tokens that appear in fewer than `no_below` documents or in more than a
    # `no_above` fraction of the documents
    dictionary.filter_extremes(no_below=extremes[0], no_above=extremes[1])
    dictionary.compactify()

    # Convert the documents into a bag-of-words format: list of (token_id, token_count) tuples
    corpus = [dictionary.doc2bow(doc) for doc in commentsL]

    for num_topics in topic_range:
        print(f'Number of topics: {num_topics} Extremes: {extremes}')
        print(len(dictionary))

In [None]:
for extremes in extreme_values:
    print('- - - - - - - - - - - - - - - - - - - - - - - ')
    coherence_scores = []

    # Create a dictionary representation of the documents.
    dictionary = corpora.Dictionary(commentsL)

    # Drop tokens that appear in fewer than `no_below` documents or in more than a
    # `no_above` fraction of the documents (thresholds come from the current setting)
    dictionary.filter_extremes(no_below=extremes[0], no_above=extremes[1])
    dictionary.compactify()
    print(f'Len of the Dictionary: {len(dictionary)}')

    # Convert the documents into a bag-of-words format: list of (token_id, token_count) tuples
    corpus = [dictionary.doc2bow(doc) for doc in commentsL]

    for num_topics in topic_range:
        print(f'Number of topics: {num_topics} Extremes: {extremes}')

        # Train the LDA model
        lda_model = LdaModel(corpus=corpus,
                         id2word=dictionary,
                         num_topics=num_topics,
                         random_state=100,
                         update_every=1,
                         chunksize=100,
                         passes=10,
                         alpha='auto',
                         per_word_topics=True)

        coherence_model_lda = CoherenceModel(model=lda_model, texts=commentsL, dictionary=dictionary, coherence='c_v')
        coherence_scores.append(coherence_model_lda.get_coherence())

    # Plot the coherence scores; the title names the filter setting so that the
    # plots produced by this loop can be told apart
    plt.plot(topic_range, coherence_scores)
    plt.xlabel("Number of Topics")
    plt.ylabel("Coherence Score")
    plt.title(f"Coherence Score vs. Number of Topics (no_below={extremes[0]}, no_above={extremes[1]})")
    plt.show()

Next, I train the model with the best number of topics found in the experiments above.

In [None]:
# Number of topics for this run
num_topics = 2

# Vocabulary over all comments, dropping tokens present in fewer than 100 documents
# or in more than 70% of them
dictionary = corpora.Dictionary(commentsL)
dictionary.filter_extremes(no_below=100, no_above=0.7)

# Bag-of-words per comment: list of (token_id, token_count) tuples
corpus = [dictionary.doc2bow(doc) for doc in commentsL]

# Training settings gathered in one place
lda_params = dict(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=30,
    alpha='auto',
    per_word_topics=True,
)
lda_model = LdaModel(**lda_params)

# Ten highest-weighted words of every topic
for topic_id in range(num_topics):
    top_words = lda_model.show_topic(topic_id, topn=10)
    print(f"Topic {topic_id + 1}:")
    print([term for term, _weight in top_words])

# c_v coherence of the trained model
coherence_model_lda = CoherenceModel(model=lda_model, texts=commentsL, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score:', coherence_lda)

# Model output for every document of the corpus
topics_for_documents = [lda_model[bow] for bow in corpus]

In [None]:
# Although the coherence score is computed correctly, it is not robust enough to determine the presence of topics
# We will stick with the BERT cosine similarity to determine the most likely topic for each comment instead
topics_for_documents[2]

## 5: ML Classifiers

In [None]:
df_ecoas.head()

In [None]:
df_ecoas.shape

### 5.1: Using TF-IDF

In [None]:
aspects

In [None]:
# Everything that is not an aspect label is removed; columns that do not exist are ignored
non_aspect_columns = [
    'GÉNERO ALUMNO',
    'PROM ACUMULADO EN PROFESIONAL',
    'Género del profesor',
    'Tipo Comentario',
    'Comentarios',
    'AVG',
    'Lemm',
    'mlq_numb',
    'mlq_asp',
]
aspects_classes = df_ecoas.drop(columns=non_aspect_columns, errors='ignore')
aspects_classes.head()

In [None]:
comentarios_merge[:5]

In [None]:
# Hold out 25% of the comments (and their aspect labels) for testing; fixed seed for a repeatable split
comments_train, comments_test, aspects_train, aspects_test = train_test_split(
    comentarios_merge,
    aspects_classes,
    test_size=0.25,
    random_state=30,
)
print('Train:', 'Comentarios->', len(comments_train), 'Aspectos->',  aspects_train.shape, '\nTest:', 'Comentarios->', len(comments_test),'Aspectos->', aspects_test.shape)

In [None]:
# TF-IDF features: the vocabulary and IDF weights are learned from the training
# comments only, then the same fitted vectorizer is applied to the test comments.
vectorizer = TfidfVectorizer()
tf_x_train = vectorizer.fit_transform(comments_train)
tf_x_test = vectorizer.transform(comments_test)

#### Decision Tree

In [None]:
# Decision Tree: one classifier per aspect, trained on the TF-IDF features
classN = ['Negative', 'Positive']
accList = []
baccList= []
aucList = []
cmatrixList = []
creportList = []

for asp in aspects:
  y_true = aspects_test[asp]

  t = DecisionTreeClassifier(random_state = 1509654,criterion='entropy',max_depth=8)
  t.fit(tf_x_train, aspects_train[asp])

  # Predict once and derive every metric from the same predictions;
  # t.score() would run predict() on the test set a second time.
  y_pred = t.predict(tf_x_test)

  accList.append(metrics.accuracy_score(y_true, y_pred))
  baccList.append(metrics.balanced_accuracy_score(y_true, y_pred))
  cmatrixList.append(metrics.confusion_matrix(y_true, y_pred))
  creportList.append(metrics.classification_report(y_true, y_pred))

# Report per aspect (accuracy and confusion matrices stay available in accList / cmatrixList)
for asp_name, bacc, report in zip(aspects, baccList, creportList):
  print('------------',asp_name,'------------')
  print('Balanced Acc: ', round(bacc,2))
  print(report)

#### Random Forest

In [None]:
# Random Forest: one classifier per aspect, trained on the TF-IDF features
classN = ['Negative', 'Positive']
accList = []
baccList= []
aucList = []
cmatrixList = []
creportList = []

for asp in aspects:
  y_true = aspects_test[asp]

  # A forest is randomised (bootstrap samples, feature subsets); fixing the seed
  # makes the reported metrics reproducible, like the other classifiers here.
  t = RandomForestClassifier(min_samples_leaf=8, random_state=1509654)
  t.fit(tf_x_train, aspects_train[asp])

  # Predict once and derive every metric from the same predictions;
  # t.score() would run predict() on the test set a second time.
  y_pred = t.predict(tf_x_test)

  accList.append(metrics.accuracy_score(y_true, y_pred))
  baccList.append(metrics.balanced_accuracy_score(y_true, y_pred))
  cmatrixList.append(metrics.confusion_matrix(y_true, y_pred))
  creportList.append(metrics.classification_report(y_true, y_pred))

# Report per aspect (accuracy and confusion matrices stay available in accList / cmatrixList)
for asp_name, bacc, report in zip(aspects, baccList, creportList):
  print('------------',asp_name,'------------')
  print('Balanced Acc: ', round(bacc,2))
  print(report)

#### SVM

In [None]:
for asp in aspects:
    print(aspects_train[asp])

In [None]:
# NOTE(review): this rebinds the notebook-wide `aspects` list (nine keys at its first
# definition) to three keys. Every later cell that loops over `aspects` is restricted to
# these three, and re-running an earlier cell that indexes `aspects` by position
# (e.g. the most-likely-question mapping) can now fail or mislabel.
aspects = ['APR', 'REC', 'RET']

In [None]:
# SVM (linear kernel): one classifier per aspect, trained on the TF-IDF features
classN = ['Negative', 'Positive']
accList = []
baccList= []
aucList = []
cmatrixList = []
creportList = []

for asp in aspects:
  y_true = aspects_test[asp]

  svm = SVC(kernel='linear', random_state=12022024)
  svm.fit(tf_x_train, aspects_train[asp])

  # Predict once and derive every metric from the same predictions;
  # svm.score() would run the (expensive) SVC predict() on the test set a second time.
  y_pred = svm.predict(tf_x_test)

  accList.append(metrics.accuracy_score(y_true, y_pred))
  baccList.append(metrics.balanced_accuracy_score(y_true, y_pred))
  cmatrixList.append(metrics.confusion_matrix(y_true, y_pred))
  creportList.append(metrics.classification_report(y_true, y_pred))

# Report per aspect (accuracy and confusion matrices stay available in accList / cmatrixList)
for asp_name, bacc, report in zip(aspects, baccList, creportList):
  print('------------',asp_name,'------------')
  print('Balanced Acc: ', round(bacc,2))
  print(report)

In [None]:
# SVM (polynomial kernel): one classifier per aspect, trained on the TF-IDF features
classN = ['Negative', 'Positive']
accList = []
baccList= []
aucList = []
cmatrixList = []
creportList = []

for asp in aspects:
  y_true = aspects_test[asp]

  svm = SVC(kernel='poly', random_state=12022024)
  svm.fit(tf_x_train, aspects_train[asp])

  # Predict once and derive every metric from the same predictions;
  # svm.score() would run the (expensive) SVC predict() on the test set a second time.
  y_pred = svm.predict(tf_x_test)

  accList.append(metrics.accuracy_score(y_true, y_pred))
  baccList.append(metrics.balanced_accuracy_score(y_true, y_pred))
  cmatrixList.append(metrics.confusion_matrix(y_true, y_pred))
  creportList.append(metrics.classification_report(y_true, y_pred))

# Report per aspect (accuracy and confusion matrices stay available in accList / cmatrixList)
for asp_name, bacc, report in zip(aspects, baccList, creportList):
  print('------------',asp_name,'------------')
  print('Balanced Acc: ', round(bacc,2))
  print(report)

In [None]:
# SVM (RBF kernel): one classifier per aspect, trained on the TF-IDF features
classN = ['Negative', 'Positive']
accList = []
baccList= []
aucList = []
cmatrixList = []
creportList = []

for asp in aspects:
  y_true = aspects_test[asp]

  svm = SVC(kernel='rbf', random_state=12022024)
  svm.fit(tf_x_train, aspects_train[asp])

  # Predict once and derive every metric from the same predictions;
  # svm.score() would run the (expensive) SVC predict() on the test set a second time.
  y_pred = svm.predict(tf_x_test)

  accList.append(metrics.accuracy_score(y_true, y_pred))
  baccList.append(metrics.balanced_accuracy_score(y_true, y_pred))
  cmatrixList.append(metrics.confusion_matrix(y_true, y_pred))
  creportList.append(metrics.classification_report(y_true, y_pred))

# Report per aspect (accuracy and confusion matrices stay available in accList / cmatrixList)
for asp_name, bacc, report in zip(aspects, baccList, creportList):
  print('------------',asp_name,'------------')
  print('Balanced Acc: ', round(bacc,2))
  print(report)

In [None]:
# SVM (sigmoid kernel): one classifier per aspect, trained on the TF-IDF features
classN = ['Negative', 'Positive']
accList = []
baccList= []
aucList = []
cmatrixList = []
creportList = []

for asp in aspects:
  y_true = aspects_test[asp]

  svm = SVC(kernel='sigmoid', random_state=12022024)
  svm.fit(tf_x_train, aspects_train[asp])

  # Predict once and derive every metric from the same predictions;
  # svm.score() would run the (expensive) SVC predict() on the test set a second time.
  y_pred = svm.predict(tf_x_test)

  accList.append(metrics.accuracy_score(y_true, y_pred))
  baccList.append(metrics.balanced_accuracy_score(y_true, y_pred))
  cmatrixList.append(metrics.confusion_matrix(y_true, y_pred))
  creportList.append(metrics.classification_report(y_true, y_pred))

# Report per aspect (accuracy and confusion matrices stay available in accList / cmatrixList)
for asp_name, bacc, report in zip(aspects, baccList, creportList):
  print('------------',asp_name,'------------')
  print('Balanced Acc: ', round(bacc,2))
  print(report)

#### Naive-Bayes

In [None]:
# Naive Bayes: one classifier per selected aspect, trained on the TF-IDF features
# The aspect selection lives in its own name: assigning it to `aspects_test` replaced the
# test-label DataFrame with a list, so `aspects_test[asp]` raised a TypeError.
nb_aspects = ['REC', 'MEJ']
classN = ['Negative', 'Positive']
accList = []
baccList= []
aucList = []
cmatrixList = []
creportList = []

for asp in nb_aspects:
  y_true = aspects_test[asp]

  mnb = MultinomialNB()
  mnb.fit(tf_x_train, aspects_train[asp])

  # Predict once and derive every metric from the same predictions
  y_pred = mnb.predict(tf_x_test)

  accList.append(metrics.accuracy_score(y_true, y_pred))
  baccList.append(metrics.balanced_accuracy_score(y_true, y_pred))
  cmatrixList.append(metrics.confusion_matrix(y_true, y_pred))
  creportList.append(metrics.classification_report(y_true, y_pred))

# Report per aspect (accuracy and confusion matrices stay available in accList / cmatrixList)
for asp_name, bacc, report in zip(nb_aspects, baccList, creportList):
  print('------------',asp_name,'------------')
  print('Balanced Acc: ', round(bacc,2))
  print(report)

### 5.2: Using Word Embeddings

In [None]:
# Load the model
model = Word2Vec.load('word2vec.model')

# Define a function to obtain the comment vectors
def get_comments_vectors(comments):
    """Return one summed Word2Vec vector per comment.

    Parameters
    ----------
    comments : iterable
        Each item is either an already tokenized comment (iterable of words)
        or a raw string, which is split on whitespace first.

    Returns
    -------
    list of numpy.ndarray
        One vector of length `model.vector_size` per comment, in input order.
        Words missing from the model's vocabulary are skipped, so a comment
        with no known word yields a zero vector.
    """
    comments_vectors = []
    for comment in comments:
        # Iterating a raw string would walk it character by character, so split it first
        comment_words = comment.split() if isinstance(comment, str) else comment
        comment_vector = np.zeros(model.vector_size)  # Accumulator for the word vectors

        for word in comment_words:
            if word in model.wv:
                comment_vector += model.wv[word]  # Add the word vector to the comment vector
        comments_vectors.append(comment_vector)
    return comments_vectors

# Example usage
comments_vectors = get_comments_vectors(comments)

In [None]:
print(comments_vectors[0])
print(len(comments_vectors))

In [None]:
# Same 75/25 split and seed as the TF-IDF experiment, but over the comment embedding vectors
comments_train, comments_test, aspects_train, aspects_test = train_test_split(
    comments_vectors,
    aspects_classes,
    test_size=0.25,
    random_state=30,
)
print('Train:', 'Comentarios->', len(comments_train), 'Aspectos->',  aspects_train.shape, '\nTest:', 'Comentarios->', len(comments_test),'Aspectos->', aspects_test.shape)

## 6: DL Classifiers

### 6.1 Using Word Embeddings

In [None]:
# SVM (polynomial kernel) on the Word2Vec comment vectors, one classifier per selected aspect
# The aspect selection lives in its own name so that the `aspects_test` label DataFrame
# produced by train_test_split is not overwritten (later cells still index it by aspect).
embedding_aspects = ['REC', 'APR']
classN = ['Negative', 'Positive']
accList = []
baccList= []
aucList = []
cmatrixList = []
creportList = []

for asp in embedding_aspects:
  y_true = aspects_test[asp]

  svm = SVC(kernel='poly', random_state=12022024)
  # Train on the training split only: fitting on the full, unsplit `comments_vectors`
  # does not match the number of rows in aspects_train.
  svm.fit(comments_train, aspects_train[asp])

  # Evaluate on the embedding test split; the TF-IDF matrix has a different feature
  # space and cannot be fed to a model trained on embedding vectors.
  y_pred = svm.predict(comments_test)

  accList.append(metrics.accuracy_score(y_true, y_pred))
  baccList.append(metrics.balanced_accuracy_score(y_true, y_pred))
  cmatrixList.append(metrics.confusion_matrix(y_true, y_pred))
  creportList.append(metrics.classification_report(y_true, y_pred))

# Report per aspect (accuracy and confusion matrices stay available in accList / cmatrixList)
for asp_name, bacc, report in zip(embedding_aspects, baccList, creportList):
  print('------------',asp_name,'------------')
  print('Balanced Acc: ', round(bacc,2))
  print(report)

#### Multilayer perceptron

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
aspectos = ['APR', 'DOM', 'REC', 'RET']
classN = ['Negative', 'Neutral', 'Positive']
accList = []
baccList= []
aucList = []
cmatrixList = []
creportList = []
grafos = []

# train_test_split returns plain Python lists for list input, and Keras interprets a
# list of arrays as several separate model inputs; stack them into 2-D arrays instead.
x_train = np.asarray(comments_train)
x_test = np.asarray(comments_test)

for asp in aspectos:
  # Train a separate MLP for each aspect
  # NOTE(review): sparse_categorical_crossentropy with 3 output units assumes the labels
  # are integer-encoded as 0, 1, 2 — confirm against the preprocessing.
  m = Sequential()
  # Input width follows the embedding size instead of a hard-coded 100
  m.add(Dense(128, input_shape=(x_train.shape[1],), activation='relu'))
  m.add(Dense(64, activation='relu'))
  m.add(Dense(3, activation='softmax'))

  # Compile the model
  m.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

  # Train the model
  m.fit(x_train, aspects_train[asp], epochs=10, batch_size=32, validation_data=(x_test, aspects_test[asp]))

  # Predicted class = index of the highest softmax probability
  y_pred = np.argmax(m.predict(x_test), axis=-1)

  cr = metrics.classification_report(aspects_test[asp], y_pred)
  creportList.append(cr)
  print(cr)

#### TODO: Convolutional Neural Network

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

# Define the model
cnn = Sequential()

# Vocabulary size
# NOTE(review): if index 0 is reserved for padding when the comments are converted to
# id sequences, input_dim must be vocab_size + 1 — confirm once that step exists.
vocab_size = len(model.wv.key_to_index)

# Embedding dimension
embedding_dim = model.vector_size

# Add an embedding layer (sequences are expected to be padded/truncated to 140 tokens)
cnn.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=140))

# Add a 1D convolutional layer
cnn.add(Conv1D(filters=128, kernel_size=5, activation='relu'))

# Add a global max pooling layer
cnn.add(GlobalMaxPooling1D())

# Add a dense layer
cnn.add(Dense(units=64, activation='relu'))

# Add an output layer. A softmax over a single unit always outputs 1.0, so the network
# could never predict the negative class; sigmoid is the activation that pairs with
# binary_crossentropy on one output unit.
cnn.add(Dense(units=1, activation='sigmoid'))

# Compile the model
cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the model summary
cnn.summary()


## 7: Transformers