## Dependencies

In [None]:
import matplotlib.pyplot as plt

import numpy as np
from nltk.tokenize.treebank import TreebankWordDetokenizer

import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

from transformers import AutoTokenizer, AutoModelForMaskedLM
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

from wordcloud import WordCloud

## Pipeline Method

For the pipeline method, we first need to perform topic classification. Since we don't have a labeled dataset with the topics/educational aspects for each comment and sentence, we use sentence similarity to obtain these labels in a zero-shot classification style.

In the second part, we perform sentiment analysis with three different approaches: ML, DL, and attention-based algorithms.

## 5: ML Classifiers

In [98]:
# Preview the first rows of df_ecoas: per-aspect indicator columns, the raw
# comment text, and its lemmatized token list.
df_ecoas.head()

Unnamed: 0,APR,DOM,EVA,MEJ,MET,PRA,REC,RET,ASE,GÉNERO ALUMNO,PROM ACUMULADO EN PROFESIONAL,Género del profesor,Tipo Comentario,Comentarios,AVG,Lemm,mlq_numb,mlq_asp
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0,94.428,0.0,0,"Sabe explicar muy bien las cosas teoricas, pe...",0.0,"[saber, explicar, bien, cosa, teorico, hacer, ...",4,RET
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,95.968,0.0,0,Método anticuado de enseñar. Los temas podría...,0.0,"[método, anticuado, enseñar, tema, poder, hace...",4,RET
2,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0,96.408,0.0,1,"Sabe muchísimo del tema, muy preparada.",0.0,"[saber, muchísimo, tema, preparado]",4,RET
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0,94.981,0.0,2,buena maestra si lo recomiendo,1.0,"[buen, maestro, si, recomer]",4,RET
4,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,89.04,0.0,1,Tiene mucho conocimiento sobre los temas.,0.0,"[conocimiento, tema]",6,DOM


In [99]:
# (rows, columns) of the full frame before any train/test split.
df_ecoas.shape

(96540, 18)

### 5.1: Using TF-IDF

In [100]:
# Aspect codes that the per-aspect classifier loops below iterate over.
aspects

['MET', 'PRA', 'ASE', 'EVA', 'RET', 'APR', 'DOM', 'REC', 'MEJ']

In [101]:
# Build the label frame by dropping every non-label column from df_ecoas.
# errors='ignore' keeps the drop from failing if a listed column is absent.
aspects_classes = df_ecoas.drop(
    columns=[
        'GÉNERO ALUMNO',
        'PROM ACUMULADO EN PROFESIONAL',
        'Género del profesor',
        'Tipo Comentario',
        'Comentarios',
        'AVG',
        'Lemm',
        'mlq_numb',
        'mlq_asp',
    ],
    errors='ignore',
)
aspects_classes.head()

Unnamed: 0,APR,DOM,EVA,MEJ,MET,PRA,REC,RET,ASE
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [102]:
# Peek at the first five comment strings that are fed to train_test_split below.
comentarios_merge[:5]

['saber explicar bien cosa teorico hacer falta aterrizar tema practicar mercadotecnia interno poner realizar proyecto mercadotecnia interno empresa tener alguno ejemplo concreto acción poder implementar solo dar teoria basico',
 'método anticuado enseñar tema poder hacer el interesante alumno',
 'saber muchísimo tema preparado',
 'buen maestro si recomer',
 'conocimiento tema']

In [117]:
# Hold out 25% of the comments (and their aspect labels) for testing; the
# fixed random_state keeps the partition identical across runs.
comments_train, comments_test, aspects_train, aspects_test = train_test_split(
    comentarios_merge,
    aspects_classes,
    test_size=0.25,
    random_state=30,
)
# Sanity check: both halves of each split must have matching lengths.
print(
    'Train:', 'Comentarios->', len(comments_train), 'Aspectos->',  aspects_train.shape,
    '\nTest:', 'Comentarios->', len(comments_test), 'Aspectos->', aspects_test.shape,
)

Train: Comentarios-> 72405 Aspectos-> (72405, 9) 
Test: Comentarios-> 24135 Aspectos-> (24135, 9)


In [104]:
# Learn the TF-IDF vocabulary and IDF weights from the training comments only,
# then reuse that fitted vectorizer to transform the test comments so no
# statistics from the test split leak into the features.
vectorizer = TfidfVectorizer()
tf_x_train = vectorizer.fit_transform(comments_train)
tf_x_test = vectorizer.transform(comments_test)

#### Decision Tree

In [105]:
# Decision Tree: fit one depth-limited entropy tree per aspect on the TF-IDF
# features, collect its test metrics, then print a summary for every aspect.
classN = ['Negative', 'Positive']
accList, baccList, aucList, cmatrixList, creportList = [], [], [], [], []

for asp in aspects:
    y_true = aspects_test[asp]

    t = DecisionTreeClassifier(criterion='entropy', max_depth=8, random_state=1509654)
    t.fit(tf_x_train, aspects_train[asp])

    # Test-set predictions shared by the metrics below.
    y_pred = t.predict(tf_x_test)

    accList.append(t.score(tf_x_test, y_true))
    baccList.append(metrics.balanced_accuracy_score(y_true, y_pred))
    cmatrixList.append(metrics.confusion_matrix(y_true, y_pred))
    creportList.append(metrics.classification_report(y_true, y_pred))

# Report balanced accuracy and the per-class report, one section per aspect.
for aspect_name, bacc, report in zip(aspects, baccList, creportList):
    print('------------', aspect_name, '------------')
    print('Balanced Acc: ', round(bacc, 2))
    print(report)

------------ MET ------------
Balanced Acc:  0.65
              precision    recall  f1-score   support

         0.0       0.72      0.43      0.54     10848
         1.0       0.65      0.87      0.74     13287

    accuracy                           0.67     24135
   macro avg       0.69      0.65      0.64     24135
weighted avg       0.68      0.67      0.65     24135

------------ PRA ------------
Balanced Acc:  0.63
              precision    recall  f1-score   support

         0.0       0.68      0.40      0.50      9852
         1.0       0.68      0.87      0.76     14283

    accuracy                           0.68     24135
   macro avg       0.68      0.63      0.63     24135
weighted avg       0.68      0.68      0.66     24135

------------ ASE ------------
Balanced Acc:  0.62
              precision    recall  f1-score   support

         0.0       0.61      0.35      0.44      7698
         1.0       0.75      0.90      0.81     16437

    accuracy                    

#### Random Forest

In [106]:
# Random Forest: fit one forest per aspect on the TF-IDF features, collect its
# test metrics, then print balanced accuracy and the classification report.
classN = ['Negative', 'Positive']
accList = []
baccList= []
aucList = []
cmatrixList = []
creportList = []

for asp in aspects:
  y_true = aspects_test[asp]

  # Seed the forest so the reported numbers are reproducible on re-run, like
  # the seeded Decision Tree and SVM cells; an unseeded forest draws different
  # bootstrap samples and feature subsets on every execution.
  t = RandomForestClassifier(min_samples_leaf=8, random_state=12022024)
  t.fit(tf_x_train, aspects_train[asp])

  # Predict once and derive every metric from the same predictions;
  # t.score() would run a second prediction pass just to compute accuracy.
  y_pred = t.predict(tf_x_test)

  accList.append(metrics.accuracy_score(y_true, y_pred))
  baccList.append(metrics.balanced_accuracy_score(y_true, y_pred))
  cmatrixList.append(metrics.confusion_matrix(y_true, y_pred))
  creportList.append(metrics.classification_report(y_true, y_pred))

# One summary section per aspect.
for aspect_name, bacc, report in zip(aspects, baccList, creportList):
  print('------------', aspect_name, '------------')
  print('Balanced Acc: ', round(bacc, 2))
  print(report)

------------ MET ------------
Balanced Acc:  0.69
              precision    recall  f1-score   support

         0.0       0.73      0.54      0.62     10848
         1.0       0.69      0.84      0.76     13287

    accuracy                           0.70     24135
   macro avg       0.71      0.69      0.69     24135
weighted avg       0.71      0.70      0.70     24135

------------ PRA ------------
Balanced Acc:  0.66
              precision    recall  f1-score   support

         0.0       0.72      0.45      0.55      9852
         1.0       0.70      0.88      0.78     14283

    accuracy                           0.70     24135
   macro avg       0.71      0.66      0.67     24135
weighted avg       0.71      0.70      0.69     24135

------------ ASE ------------
Balanced Acc:  0.6
              precision    recall  f1-score   support

         0.0       0.75      0.23      0.35      7698
         1.0       0.73      0.96      0.83     16437

    accuracy                     

#### SVM

In [107]:
# Debugging aid: dump the training label Series of every aspect to inspect
# their values and dtype before fitting the SVMs.
# NOTE(review): this prints one long Series per aspect; consider removing the
# cell or replacing it with a compact summary once it is no longer needed.
for asp in aspects:
    print(aspects_train[asp])

48734    1.0
36669    1.0
74941    0.0
14345    1.0
55199    1.0
        ... 
66455    1.0
46220    0.0
48045    0.0
70053    1.0
38693    1.0
Name: MET, Length: 72405, dtype: float64
48734    1.0
36669    1.0
74941    0.0
14345    1.0
55199    1.0
        ... 
66455    1.0
46220    0.0
48045    0.0
70053    1.0
38693    1.0
Name: PRA, Length: 72405, dtype: float64
48734    1.0
36669    1.0
74941    0.0
14345    1.0
55199    1.0
        ... 
66455    1.0
46220    1.0
48045    0.0
70053    1.0
38693    1.0
Name: ASE, Length: 72405, dtype: float64
48734    1.0
36669    1.0
74941    0.0
14345    1.0
55199    1.0
        ... 
66455    1.0
46220    0.0
48045    0.0
70053    1.0
38693    1.0
Name: EVA, Length: 72405, dtype: float64
48734    1.0
36669    1.0
74941    0.0
14345    1.0
55199    1.0
        ... 
66455    1.0
46220    0.0
48045    0.0
70053    1.0
38693    1.0
Name: RET, Length: 72405, dtype: float64
48734    1.0
36669    1.0
74941    0.0
14345    1.0
55199    1.0
        ... 
66

In [108]:
# Restrict the per-aspect loops in the following cells to three aspects.
# NOTE(review): this rebinds the notebook-wide `aspects` list (nine codes when
# displayed earlier), so re-running the earlier classifier cells after this
# one will silently evaluate only these three aspects.
aspects = ['APR', 'REC', 'RET']

In [109]:
# SVM linear: fit one linear-kernel SVC per aspect on the TF-IDF features,
# collect its test metrics, then print a summary for every aspect.
classN = ['Negative', 'Positive']
accList = []
baccList= []
aucList = []
cmatrixList = []
creportList = []

for asp in aspects:
  y_true = aspects_test[asp]

  svm = SVC(kernel='linear', random_state=12022024)
  svm.fit(tf_x_train, aspects_train[asp])

  # Predict once and derive every metric from the same predictions.
  # svm.score() would repeat the full (expensive) SVM prediction pass over the
  # test set only to compute the same accuracy value.
  y_pred = svm.predict(tf_x_test)

  accList.append(metrics.accuracy_score(y_true, y_pred))
  baccList.append(metrics.balanced_accuracy_score(y_true, y_pred))
  cmatrixList.append(metrics.confusion_matrix(y_true, y_pred))
  creportList.append(metrics.classification_report(y_true, y_pred))

# One summary section per aspect.
for aspect_name, bacc, report in zip(aspects, baccList, creportList):
  print('------------', aspect_name, '------------')
  print('Balanced Acc: ', round(bacc, 2))
  print(report)

------------ APR ------------
Balanced Acc:  0.69
              precision    recall  f1-score   support

         0.0       0.72      0.49      0.58      8841
         1.0       0.75      0.89      0.81     15294

    accuracy                           0.74     24135
   macro avg       0.73      0.69      0.70     24135
weighted avg       0.74      0.74      0.73     24135

------------ REC ------------
Balanced Acc:  0.74
              precision    recall  f1-score   support

         0.0       0.75      0.58      0.66      8719
         1.0       0.79      0.89      0.84     15416

    accuracy                           0.78     24135
   macro avg       0.77      0.74      0.75     24135
weighted avg       0.78      0.78      0.77     24135

------------ RET ------------
Balanced Acc:  0.65
              precision    recall  f1-score   support

         0.0       0.68      0.42      0.52      8962
         1.0       0.72      0.88      0.79     15173

    accuracy                    

In [110]:
# SVM poly: fit one polynomial-kernel SVC per aspect on the TF-IDF features,
# collect its test metrics, then print a summary for every aspect.
classN = ['Negative', 'Positive']
accList = []
baccList= []
aucList = []
cmatrixList = []
creportList = []

for asp in aspects:
  y_true = aspects_test[asp]

  svm = SVC(kernel='poly', random_state=12022024)
  svm.fit(tf_x_train, aspects_train[asp])

  # Predict once and derive every metric from the same predictions.
  # svm.score() would repeat the full (expensive) SVM prediction pass over the
  # test set only to compute the same accuracy value.
  y_pred = svm.predict(tf_x_test)

  accList.append(metrics.accuracy_score(y_true, y_pred))
  baccList.append(metrics.balanced_accuracy_score(y_true, y_pred))
  cmatrixList.append(metrics.confusion_matrix(y_true, y_pred))
  creportList.append(metrics.classification_report(y_true, y_pred))

# One summary section per aspect.
for aspect_name, bacc, report in zip(aspects, baccList, creportList):
  print('------------', aspect_name, '------------')
  print('Balanced Acc: ', round(bacc, 2))
  print(report)

------------ APR ------------
Balanced Acc:  0.68
              precision    recall  f1-score   support

         0.0       0.74      0.45      0.56      8841
         1.0       0.74      0.91      0.82     15294

    accuracy                           0.74     24135
   macro avg       0.74      0.68      0.69     24135
weighted avg       0.74      0.74      0.72     24135

------------ REC ------------
Balanced Acc:  0.73
              precision    recall  f1-score   support

         0.0       0.79      0.54      0.64      8719
         1.0       0.78      0.92      0.84     15416

    accuracy                           0.78     24135
   macro avg       0.78      0.73      0.74     24135
weighted avg       0.78      0.78      0.77     24135

------------ RET ------------
Balanced Acc:  0.64
              precision    recall  f1-score   support

         0.0       0.69      0.38      0.49      8962
         1.0       0.71      0.90      0.79     15173

    accuracy                    

In [111]:
# SVM rbf: fit one RBF-kernel SVC per aspect on the TF-IDF features,
# collect its test metrics, then print a summary for every aspect.
classN = ['Negative', 'Positive']
accList = []
baccList= []
aucList = []
cmatrixList = []
creportList = []

for asp in aspects:
  y_true = aspects_test[asp]

  svm = SVC(kernel='rbf', random_state=12022024)
  svm.fit(tf_x_train, aspects_train[asp])

  # Predict once and derive every metric from the same predictions.
  # svm.score() would repeat the full (expensive) SVM prediction pass over the
  # test set only to compute the same accuracy value.
  y_pred = svm.predict(tf_x_test)

  accList.append(metrics.accuracy_score(y_true, y_pred))
  baccList.append(metrics.balanced_accuracy_score(y_true, y_pred))
  cmatrixList.append(metrics.confusion_matrix(y_true, y_pred))
  creportList.append(metrics.classification_report(y_true, y_pred))

# One summary section per aspect.
for aspect_name, bacc, report in zip(aspects, baccList, creportList):
  print('------------', aspect_name, '------------')
  print('Balanced Acc: ', round(bacc, 2))
  print(report)

------------ APR ------------
Balanced Acc:  0.7
              precision    recall  f1-score   support

         0.0       0.74      0.51      0.60      8841
         1.0       0.76      0.90      0.82     15294

    accuracy                           0.75     24135
   macro avg       0.75      0.70      0.71     24135
weighted avg       0.75      0.75      0.74     24135

------------ REC ------------
Balanced Acc:  0.75
              precision    recall  f1-score   support

         0.0       0.77      0.60      0.67      8719
         1.0       0.80      0.90      0.85     15416

    accuracy                           0.79     24135
   macro avg       0.79      0.75      0.76     24135
weighted avg       0.79      0.79      0.78     24135

------------ RET ------------
Balanced Acc:  0.66
              precision    recall  f1-score   support

         0.0       0.70      0.44      0.54      8962
         1.0       0.73      0.89      0.80     15173

    accuracy                     

In [112]:
# SVM sigmoid: fit one sigmoid-kernel SVC per aspect on the TF-IDF features,
# collect its test metrics, then print a summary for every aspect.
classN = ['Negative', 'Positive']
accList = []
baccList= []
aucList = []
cmatrixList = []
creportList = []

for asp in aspects:
  y_true = aspects_test[asp]

  svm = SVC(kernel='sigmoid', random_state=12022024)
  svm.fit(tf_x_train, aspects_train[asp])

  # Predict once and derive every metric from the same predictions.
  # svm.score() would repeat the full (expensive) SVM prediction pass over the
  # test set only to compute the same accuracy value.
  y_pred = svm.predict(tf_x_test)

  accList.append(metrics.accuracy_score(y_true, y_pred))
  baccList.append(metrics.balanced_accuracy_score(y_true, y_pred))
  cmatrixList.append(metrics.confusion_matrix(y_true, y_pred))
  creportList.append(metrics.classification_report(y_true, y_pred))

# One summary section per aspect.
for aspect_name, bacc, report in zip(aspects, baccList, creportList):
  print('------------', aspect_name, '------------')
  print('Balanced Acc: ', round(bacc, 2))
  print(report)

------------ APR ------------
Balanced Acc:  0.64
              precision    recall  f1-score   support

         0.0       0.55      0.53      0.54      8841
         1.0       0.73      0.75      0.74     15294

    accuracy                           0.67     24135
   macro avg       0.64      0.64      0.64     24135
weighted avg       0.67      0.67      0.67     24135

------------ REC ------------
Balanced Acc:  0.69
              precision    recall  f1-score   support

         0.0       0.62      0.59      0.60      8719
         1.0       0.77      0.80      0.79     15416

    accuracy                           0.72     24135
   macro avg       0.70      0.69      0.69     24135
weighted avg       0.72      0.72      0.72     24135

------------ RET ------------
Balanced Acc:  0.6
              precision    recall  f1-score   support

         0.0       0.51      0.49      0.50      8962
         1.0       0.70      0.72      0.71     15173

    accuracy                     

#### Naive-Bayes

In [120]:
# Naive Bayes: fit one multinomial NB model per aspect on the TF-IDF features
# and print its classification report. Only the report is collected here; the
# other metric lists are initialised but left empty.
classN = ['Negative', 'Positive']
accList, baccList, aucList, cmatrixList, creportList = [], [], [], [], []

for asp in aspects:
    mnb = MultinomialNB()
    mnb.fit(tf_x_train, np.array(aspects_train[asp]))

    # Test-set predictions for this aspect.
    y_pred = mnb.predict(tf_x_test)

    creportList.append(
        metrics.classification_report(np.array(aspects_test[asp]), y_pred)
    )

# One report section per aspect.
for aspect_name, report in zip(aspects, creportList):
    print('------------', aspect_name, '------------')
    print(report)

------------ APR ------------
              precision    recall  f1-score   support

         0.0       0.75      0.40      0.52      8841
         1.0       0.73      0.92      0.81     15294

    accuracy                           0.73     24135
   macro avg       0.74      0.66      0.67     24135
weighted avg       0.73      0.73      0.71     24135

------------ REC ------------
              precision    recall  f1-score   support

         0.0       0.78      0.48      0.59      8719
         1.0       0.76      0.92      0.83     15416

    accuracy                           0.76     24135
   macro avg       0.77      0.70      0.71     24135
weighted avg       0.77      0.76      0.75     24135

------------ RET ------------
              precision    recall  f1-score   support

         0.0       0.71      0.34      0.46      8962
         1.0       0.70      0.92      0.80     15173

    accuracy                           0.70     24135
   macro avg       0.71      0.63     

## 6: DL Classifiers

In [131]:
# Load the previously trained Word2Vec model from disk; `model.wv` and
# `model.vector_size` are used by the cells below.
# NOTE(review): `Word2Vec` is not imported in the Dependencies cell shown in
# this notebook - presumably `from gensim.models import Word2Vec`; confirm and
# add it there so the notebook survives Restart & Run All.
model = Word2Vec.load('./w2v_models/word2vec.model')

# Define a function to obtain the comment vector
def get_comments_vectors(comments, w2v_model=None):
    """Build one fixed-size vector per comment by summing its word vectors.

    Args:
        comments: Iterable of comments. Each comment is either a
            whitespace-separated string or an iterable of word tokens.
        w2v_model: Object exposing ``vector_size`` and a ``wv`` mapping that
            supports ``word in wv`` and ``wv[word]``. Defaults to the
            notebook-level ``model``.

    Returns:
        A list with one 1-D numpy array of length ``vector_size`` per comment.
        Words missing from the vocabulary are skipped, so a comment with no
        known words yields an all-zero vector.
    """
    if w2v_model is None:
        w2v_model = model  # fall back to the model loaded in this notebook
    word_vectors = w2v_model.wv

    comments_vectors = []
    for comment in comments:
        # Split string comments into words; iterating a raw string would
        # otherwise look up single characters instead of words.
        comment_words = comment.split() if isinstance(comment, str) else comment
        comment_vector = np.zeros(w2v_model.vector_size)  # accumulator for the word-vector sum

        for word in comment_words:
            if word in word_vectors:
                comment_vector += word_vectors[word]  # add the word vector to the comment vector
        comments_vectors.append(comment_vector)
    return comments_vectors

# Vectorize every comment, show the first vector and the number of vectors,
# then split vectors and aspect labels 75/25 with the same seed as the TF-IDF split.
# NOTE(review): `comments` is not defined in any cell visible here - confirm
# which variable it is meant to be (raw strings vs. token lists).
# NOTE(review): this rebinds comments_train/comments_test/aspects_train/
# aspects_test, replacing the text split used by the TF-IDF cells above.
comments_vectors = get_comments_vectors(comments)
print(comments_vectors[0])
print(len(comments_vectors))
comments_train,comments_test,aspects_train, aspects_test = train_test_split(comments_vectors, aspects_classes, test_size=0.25, random_state=30)
print('Train:', 'Comentarios->', len(comments_train), 'Aspectos->',  aspects_train.shape, '\nTest:', 'Comentarios->', len(comments_test),'Aspectos->', aspects_test.shape)

[ -7.51805429  -2.43519413  11.76174551   1.36619275   2.3938665
 -21.16311133  14.98464436  21.87620687  -5.58261746 -12.2597249
 -16.82807582 -16.40841573 -19.81503848   8.59698934   6.14641747
  -3.7361375  -10.95644514 -18.8040156   -1.00841434  -7.30020604
   5.12455532  12.72212625  -1.48355504   4.31188828   2.56153004
   3.56353559   0.92789734 -17.41711313  -9.61646843  10.73029927
  -6.79679339   9.56441091  -9.17715265   4.57565456   8.0847787
  24.12119621  -6.22237498 -18.3494462    1.81133241 -11.53095767
  -0.05433323  -9.82575543   5.8945538    1.83027577   6.98393537
  -3.41586851 -12.02966948  -5.18901796  14.5433255    3.48189728
  -4.96234688 -14.21012796   6.82766623  -0.02806839   7.55959904
   7.99780803  14.40197773   7.47481544 -13.7115967   15.6626811
   3.83050488   6.90960872  -7.95972436 -12.4183944   -8.13754288
  -8.58462658  -6.29011625  -0.82961985  -5.13217044  14.44935046
  -0.48814558  -3.08686768  -4.42064369 -15.86174432   7.3131324
 -11.09348664  

### 6.1 Using Word Embeddings

#### Multilayer perceptron

In [None]:
# Multilayer perceptron: train one small dense network per aspect on the
# comment vectors and print its classification report on the test split.
aspectos = ['APR', 'DOM', 'REC', 'RET']
classN = ['Negative', 'Neutral', 'Positive']
accList = []
baccList= []
aucList = []
cmatrixList = []
creportList = []
grafos = []

# train_test_split on a Python list returns lists, and Keras treats a list of
# arrays as "one array per model input". Stack the per-comment vectors into a
# single 2-D (n_samples, n_features) matrix before fitting.
X_train = np.asarray(comments_train, dtype='float32')
X_test = np.asarray(comments_test, dtype='float32')
n_features = X_train.shape[1]  # derived from the data instead of a hard-coded 100

for asp in aspectos:
  # Integer class ids for sparse_categorical_crossentropy.
  y_train = aspects_train[asp].to_numpy().astype('int32')
  y_test = aspects_test[asp].to_numpy().astype('int32')

  # Train a fresh MLP for this aspect.
  # NOTE(review): the output layer has len(classN) == 3 units, but the labels
  # shown earlier in the notebook are only 0.0/1.0 - confirm the class count.
  m = Sequential()
  m.add(Dense(128, input_shape=(n_features,), activation='relu'))
  m.add(Dense(64, activation='relu'))
  m.add(Dense(len(classN), activation='softmax'))

  # Compile the model
  m.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

  # Train the model
  # NOTE(review): the test split doubles as validation data here; this is only
  # safe while it is used for monitoring and not for model selection.
  m.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

  # Metrics: the predicted class is the index of the largest softmax output.
  y_pred = np.argmax(m.predict(X_test), axis=-1)

  cr = metrics.classification_report(aspects_test[asp], y_pred)
  creportList.append(cr)
  print(cr)

#### TODO: Convolutional Neural Network

In [135]:
# Define a 1D-CNN binary classifier: embedding -> convolution -> global max
# pooling -> dense -> single-probability output.
cnn = Sequential()

# Vocabulary size (number of words known to the Word2Vec model)
vocab_size = len(model.wv.key_to_index)

# Embedding dimension (same width as the Word2Vec vectors)
embedding_dim = model.vector_size

# Add an embedding layer
# NOTE(review): this layer is trained from scratch and expects integer token
# ids as input, not the summed comment vectors - confirm the intended input.
cnn.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))

# Add a 1D convolutional layer
cnn.add(Conv1D(filters=128, kernel_size=5, activation='relu'))

# Add a global max pooling layer
cnn.add(GlobalMaxPooling1D())

# Add a dense layer
cnn.add(Dense(units=64, activation='relu'))

# Add an output layer.
# A single unit must use sigmoid: softmax over one unit always outputs 1.0,
# so with binary_crossentropy the network could never learn anything.
cnn.add(Dense(units=1, activation='sigmoid'))

# Compile the model
cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the model summary
cnn.summary()

## 7: Transformers