# Practica 6
***
* Gónzalez Chacón Monica
* López Salazar Esmeralda Leticia
* Rodriguez Nuñez Diego Eduardo

In [18]:
import pandas as pd
import spacy
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score


In [19]:
datos = pd.read_excel("Rest_Mex_2022.xlsx")
datos

Unnamed: 0,Title,Opinion,Polarity,Attraction
0,Pésimo lugar,"Piensen dos veces antes de ir a este hotel, te...",1,Hotel
1,No vayas a lugar de Eddie,Cuatro de nosotros fuimos recientemente a Eddi...,1,Restaurant
2,Mala relación calidad-precio,seguiré corta y simple: limpieza\n- bad. Tengo...,1,Hotel
3,Minusválido? ¡No te alojes aquí!,Al reservar un hotel con multipropiedad Mayan ...,1,Hotel
4,Es una porqueria no pierdan su tiempo,"No pierdan su tiempo ni dinero, venimos porque...",1,Hotel
...,...,...,...,...
30207,Verdadera joya arquitectónica,"Es una construcción majestuosa, creo que de la...",5,Attractive
30208,Romántico,Muy al estilo de Romeo y Julieta es este sitio...,5,Attractive
30209,Parece un castillo,Ideal para subir las escalinatas y divisar su ...,5,Attractive
30210,Imperdible,"Es imperdible, de ahí puedes ver muy bien la c...",5,Attractive


In [20]:
datos.fillna('', inplace=True)

In [21]:
X = datos["Title"].astype(str) + " " + datos["Opinion"].astype(str)

In [22]:
y = datos['Polarity'].values

In [23]:
nlp = spacy.load("es_core_news_sm")

### Perceptron
***

#### Normalización de Texto

In [24]:
def Normalizar_dern(texto,combo):
    texto = texto.lower() if 'text_cleaning' in combo else texto

    doc = nlp(texto)
    tokens = []
    for token in doc:
        if not token.is_punct and not token.is_space:
            if 'stop_words' in combo and token.is_stop:
                continue
            tokens.append(token.lemma_ if 'lemmatization' in combo else token.text)
    return ' '.join(tokens) if 'tokenization' in combo else ' '.join([texto])

In [25]:
# procesos: tokenization, lemmatization, stop_words, text_cleaning
combo = [['text_cleaning','stop_words']]
resultados = {}
for c in combo:
    print(f'Procesando: {c}')
    X_preprocessed = X.apply(lambda x: Normalizar_dern(x,c))
    resultados[str(c)] = X_preprocessed

Procesando: ['text_cleaning', 'stop_words']


#### Representación de texto

In [26]:
representaciones = {}
for c in combo:
    print(f'Procesando: {c}')
    
    #TF-IDF
    vectorizer = TfidfVectorizer()
    X_vec = vectorizer.fit_transform(X_preprocessed)
    representaciones[f'{c} + TF-IDF'] = X_vec
    print(f'Combinación: {c} + TF-IDF\nPreview:', X_vec[:5])

    #TF-IDF + SVD
    svd = TruncatedSVD(n_components=100, random_state=0)
    X_vecf = svd.fit_transform(X_vec)
    representaciones[f'{c} + TF-IDF + SVD'] = X_vecf
    print(f'Combinación: {c} + TF-IDF + SVD\nPreview:', X_vecf[:5])

Procesando: ['text_cleaning', 'stop_words']
Combinación: ['text_cleaning', 'stop_words'] + TF-IDF
Preview:   (0, 33337)	0.09978576168494702
  (0, 22317)	0.06827191390358496
  (0, 23916)	0.05591830572067304
  (0, 16708)	0.15624226797000865
  (0, 24916)	0.10875820491737558
  (0, 36213)	0.08052543923570236
  (0, 36344)	0.08500061704350645
  (0, 31495)	0.11572810077144698
  (0, 43808)	0.10548372090055519
  (0, 9458)	0.12991076915432817
  (0, 27009)	0.14391119154302656
  (0, 49045)	0.09200715562303259
  (0, 49917)	0.1272404798022135
  (0, 27376)	0.1073045577827018
  (0, 49105)	0.08105378360309609
  (0, 36161)	0.04050080117850944
  (0, 30968)	0.04603306821194006
  (0, 37016)	0.06087908183210164
  (0, 43474)	0.06154700666377157
  (0, 38637)	0.10948868315173031
  (0, 23337)	0.06368050570674673
  (0, 34855)	0.08758126344806912
  (0, 30770)	0.15965376527839975
  (0, 19029)	0.056594748898312036
  (0, 19863)	0.07226911176957111
  :	:
  (4, 2803)	0.08061952375201326
  (4, 4677)	0.06009899188854153


#### División de datos

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X_vecf, y, test_size=0.2, random_state=0, shuffle=True)

#### Folds

In [28]:
kf = KFold(n_splits=5)
f1_scores = []

for train_index, test_index in kf.split(X_train):
    X_train_kf, X_test_kf = X_train[train_index], X_train[test_index]
    y_train_kf, y_test_kf = y_train[train_index], y_train[test_index]

    # Entrenar modelo
    model = MLPClassifier(hidden_layer_sizes=(200,100))
    model.fit(X_train_kf, y_train_kf)

    # Evaluar modelo
    y_pred = model.predict(X_test_kf)
    f1 = f1_score(y_test_kf, y_pred, average='macro')
    print(f'F1 Score: {f1}')
    # Guardar f1_score
    f1_scores.append(f1)
average_f1 = np.mean(f1_scores)
print(f'F1 Score promedio: {average_f1}')

F1 Score: 0.40796196349214797




F1 Score: 0.3893115262221973




F1 Score: 0.43240271400208774
F1 Score: 0.387678155767626
F1 Score: 0.3979934962148786
F1 Score promedio: 0.40306957113978753


#### Final (?

In [29]:
final_model = MLPClassifier(hidden_layer_sizes=(200,100))
final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)
final_f1 = f1_score(y_test, y_pred, average='macro')
print(f'F1 Score final: {final_f1}')
print(classification_report(y_test, y_pred))

F1 Score final: 0.40427357625247334
              precision    recall  f1-score   support

           1       0.35      0.32      0.34       104
           2       0.23      0.21      0.22       145
           3       0.29      0.30      0.30       422
           4       0.35      0.34      0.34      1163
           5       0.82      0.83      0.82      4209

    accuracy                           0.67      6043
   macro avg       0.41      0.40      0.40      6043
weighted avg       0.67      0.67      0.67      6043



