In [1]:
import pandas as pd
from sklearn.calibration import cross_val_predict
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, make_scorer, f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

In [2]:
# Three text encoders are compared in this notebook:
# raw term counts, binary term presence, and TF-IDF weights.
vectorizer_frecuencia = CountVectorizer()
vectorizer_binary = CountVectorizer(binary=True)
vectorizer_tfidf = TfidfVectorizer()

# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df_normalizacion = pd.read_pickle('df_tokenizacion_lematizacion_emociones.pkl')

# Columns carried through the split: the text column plus the numeric scores.
features = [
    'Title_Opinion',
    '__alegria__', '__tristeza__', '__enojo__',
    '__repulsion__', '__miedo__', '__sorpresa__',
    'acumuladopositivo', 'acumuladonegative',
]

# Numeric columns stacked next to the vectorized text in later cells;
# the order of this list fixes the order of the appended columns.
numeric_features = [
    'acumuladopositivo', 'acumuladonegative',
    '__alegria__', '__tristeza__', '__enojo__',
    '__repulsion__', '__miedo__', '__sorpresa__',
]

X, y = df_normalizacion[features], df_normalizacion['Polarity']

# Hold out 20% of the rows for the final evaluation; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

# Report the size of each partition.
print(f'\nTamaño de X_train: {len(X_train)} documentos')
print(f'Tamaño de X_test: {len(X_test)} documentos')
print(f'Tamaño de y_train: {len(y_train)} etiquetas')
print(f'Tamaño de y_test: {len(y_test)} etiquetas\n')


Tamaño de X_train: 24169 documentos
Tamaño de X_test: 6043 documentos
Tamaño de y_train: 24169 etiquetas
Tamaño de y_test: 6043 etiquetas



# Regresión logística

## Frecuencia

In [3]:
# Cross-validation protocol: 5 shuffled folds with a fixed seed,
# scored with macro-averaged F1.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
scoring_metric = make_scorer(f1_score, average='macro')

# Term-count encoding of the text; the vocabulary is fitted on the training split only.
X_train_vectorizer = vectorizer_frecuencia.fit_transform(X_train['Title_Opinion'])
X_test_vectorizer = vectorizer_frecuencia.transform(X_test['Title_Opinion'])

# Append the numeric columns to the right of the sparse text matrix.
X_train_final = hstack([
    X_train_vectorizer,
    csr_matrix(X_train[numeric_features].values),
])
X_test_final = hstack([
    X_test_vectorizer,
    csr_matrix(X_test[numeric_features].values),
])

# Logistic regression with a generous iteration budget.
clf_lr = LogisticRegression(max_iter=10000)

# Score the classifier on each fold of the training split.
cv_scores = cross_val_score(
    clf_lr,
    X_train_final,
    y_train,
    scoring=scoring_metric,
    cv=kf,
)

# Per-fold scores followed by their mean.
print("Puntuaciones de Validación Cruzada (F1-score):", cv_scores)
print("F1-score Promedio:", cv_scores.mean())

Puntuaciones de Validación Cruzada (F1-score): [0.4514485  0.46663268 0.47502891 0.47197186 0.48131916]
F1-score Promedio: 0.46928022217270715


## Binario

In [4]:
# Cross-validation protocol: 5 shuffled folds with a fixed seed,
# scored with macro-averaged F1.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
scoring_metric = make_scorer(f1_score, average='macro')

# Binary term-presence encoding; the vocabulary is fitted on the training split only.
X_train_vectorizer = vectorizer_binary.fit_transform(X_train['Title_Opinion'])
X_test_vectorizer = vectorizer_binary.transform(X_test['Title_Opinion'])

# Append the numeric columns to the right of the sparse text matrix.
X_train_final = hstack([
    X_train_vectorizer,
    csr_matrix(X_train[numeric_features].values),
])
X_test_final = hstack([
    X_test_vectorizer,
    csr_matrix(X_test[numeric_features].values),
])

clf_lr = LogisticRegression(max_iter=10000)

# Score the classifier on each fold of the training split.
cv_scores = cross_val_score(
    clf_lr,
    X_train_final,
    y_train,
    scoring=scoring_metric,
    cv=kf,
)

# Per-fold scores followed by their mean.
print("Puntuaciones de Validación Cruzada (F1-score):", cv_scores)
print("F1-score Promedio:", cv_scores.mean())

Puntuaciones de Validación Cruzada (F1-score): [0.4469848  0.44746882 0.45709268 0.4634629  0.47633941]
F1-score Promedio: 0.4582697210454281


## TF-IDF

In [5]:
# Cross-validation protocol: 5 shuffled folds with a fixed seed,
# scored with macro-averaged F1.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
scoring_metric = make_scorer(f1_score, average='macro')

# TF-IDF encoding of the text; fitted on the training split only.
X_train_vectorizer = vectorizer_tfidf.fit_transform(X_train['Title_Opinion'])
X_test_vectorizer = vectorizer_tfidf.transform(X_test['Title_Opinion'])

# Append the numeric columns to the right of the sparse text matrix.
X_train_final = hstack([
    X_train_vectorizer,
    csr_matrix(X_train[numeric_features].values),
])
X_test_final = hstack([
    X_test_vectorizer,
    csr_matrix(X_test[numeric_features].values),
])

clf_lr = LogisticRegression(max_iter=10000)

# Score the classifier on each fold of the training split.
cv_scores = cross_val_score(
    clf_lr,
    X_train_final,
    y_train,
    scoring=scoring_metric,
    cv=kf,
)

# Per-fold scores followed by their mean.
print("Puntuaciones de Validación Cruzada (F1-score):", cv_scores)
print("F1-score Promedio:", cv_scores.mean())

Puntuaciones de Validación Cruzada (F1-score): [0.41454532 0.42188393 0.42508969 0.43379639 0.43699923]
F1-score Promedio: 0.4264629110720174


# SVM

## Frecuencia

In [6]:
# Cross-validation protocol: 5 shuffled folds with a fixed seed,
# scored with macro-averaged F1.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
scoring_metric = make_scorer(f1_score, average='macro')

# Term-count encoding of the text; the vocabulary is fitted on the training split only.
X_train_vectorizer = vectorizer_frecuencia.fit_transform(X_train['Title_Opinion'])
X_test_vectorizer = vectorizer_frecuencia.transform(X_test['Title_Opinion'])

# Append the numeric columns to the right of the sparse text matrix.
X_train_final = hstack([
    X_train_vectorizer,
    csr_matrix(X_train[numeric_features].values),
])
X_test_final = hstack([
    X_test_vectorizer,
    csr_matrix(X_test[numeric_features].values),
])

# Support vector classifier with library-default settings.
clf_svm = SVC()

# Score the classifier on each fold of the training split.
cv_scores = cross_val_score(
    clf_svm,
    X_train_final,
    y_train,
    scoring=scoring_metric,
    cv=kf,
)

# Per-fold scores followed by their mean.
print("Puntuaciones de Validación Cruzada (F1-score):", cv_scores)
print("F1-score Promedio:", cv_scores.mean())

Puntuaciones de Validación Cruzada (F1-score): [0.29238571 0.2730753  0.28131178 0.28585705 0.29416464]
F1-score Promedio: 0.2853588963273918


## Binario

In [7]:
# Cross-validation protocol: 5 shuffled folds with a fixed seed,
# scored with macro-averaged F1.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
scoring_metric = make_scorer(f1_score, average='macro')

# Binary term-presence encoding; the vocabulary is fitted on the training split only.
X_train_vectorizer = vectorizer_binary.fit_transform(X_train['Title_Opinion'])
X_test_vectorizer = vectorizer_binary.transform(X_test['Title_Opinion'])

# Append the numeric columns to the right of the sparse text matrix.
X_train_final = hstack([
    X_train_vectorizer,
    csr_matrix(X_train[numeric_features].values),
])
X_test_final = hstack([
    X_test_vectorizer,
    csr_matrix(X_test[numeric_features].values),
])

# Support vector classifier with library-default settings.
clf_svm = SVC()

# Score the classifier on each fold of the training split.
cv_scores = cross_val_score(
    clf_svm,
    X_train_final,
    y_train,
    scoring=scoring_metric,
    cv=kf,
)

# Per-fold scores followed by their mean.
print("Puntuaciones de Validación Cruzada (F1-score):", cv_scores)
print("F1-score Promedio:", cv_scores.mean())

Puntuaciones de Validación Cruzada (F1-score): [0.35799956 0.35803997 0.35765518 0.35823345 0.3705168 ]
F1-score Promedio: 0.360488993544016


## TF-IDF

In [8]:
# Cross-validation protocol: 5 shuffled folds with a fixed seed,
# scored with macro-averaged F1.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
scoring_metric = make_scorer(f1_score, average='macro')

# TF-IDF encoding of the text; fitted on the training split only.
X_train_vectorizer = vectorizer_tfidf.fit_transform(X_train['Title_Opinion'])
X_test_vectorizer = vectorizer_tfidf.transform(X_test['Title_Opinion'])

# Append the numeric columns to the right of the sparse text matrix.
X_train_final = hstack([
    X_train_vectorizer,
    csr_matrix(X_train[numeric_features].values),
])
X_test_final = hstack([
    X_test_vectorizer,
    csr_matrix(X_test[numeric_features].values),
])

# Support vector classifier with library-default settings.
clf_svm = SVC()

# Score the classifier on each fold of the training split.
cv_scores = cross_val_score(
    clf_svm,
    X_train_final,
    y_train,
    scoring=scoring_metric,
    cv=kf,
)

# Per-fold scores followed by their mean.
print("Puntuaciones de Validación Cruzada (F1-score):", cv_scores)
print("F1-score Promedio:", cv_scores.mean())

Puntuaciones de Validación Cruzada (F1-score): [0.23036897 0.23873725 0.22904402 0.23741561 0.2451755 ]
F1-score Promedio: 0.23614827228001115


# MLP

## Frecuencia

In [9]:
# Term-count encoding of the text; the vocabulary is fitted on the training split only.
X_train_vectorizer = vectorizer_frecuencia.fit_transform(X_train['Title_Opinion'])
X_test_vectorizer = vectorizer_frecuencia.transform(X_test['Title_Opinion'])

# Append the numeric columns to the right of the sparse text matrix.
X_train_final = hstack([X_train_vectorizer, csr_matrix(X_train[numeric_features].values)])
X_test_final = hstack([X_test_vectorizer, csr_matrix(X_test[numeric_features].values)])

# Two hidden layers of 50 units each. The MLP draws random initial weights and
# random mini-batches, so it is seeded (same seed as the split and the folds)
# to make the scores below reproducible under Restart & Run All.
clf_mlp = MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=200, random_state=0)

# 5 shuffled folds with a fixed seed, scored with macro-averaged F1.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

scoring_metric = make_scorer(f1_score, average='macro')

cv_scores = cross_val_score(clf_mlp, X_train_final, y_train, cv=kf, scoring=scoring_metric)

# Per-fold scores followed by their mean.
print("Puntuaciones de Validación Cruzada (F1-score):", cv_scores)
print("F1-score Promedio:", cv_scores.mean())

Puntuaciones de Validación Cruzada (F1-score): [0.43744381 0.45019299 0.44880103 0.4454098  0.47452307]
F1-score Promedio: 0.4512741379595326


## Binario

In [10]:
# Binary term-presence encoding; the vocabulary is fitted on the training split only.
X_train_vectorizer = vectorizer_binary.fit_transform(X_train['Title_Opinion'])
X_test_vectorizer = vectorizer_binary.transform(X_test['Title_Opinion'])

# Append the numeric columns to the right of the sparse text matrix.
X_train_final = hstack([X_train_vectorizer, csr_matrix(X_train[numeric_features].values)])
X_test_final = hstack([X_test_vectorizer, csr_matrix(X_test[numeric_features].values)])

# Two hidden layers of 50 units each. The MLP draws random initial weights and
# random mini-batches, so it is seeded (same seed as the split and the folds)
# to make the scores below reproducible under Restart & Run All.
clf_mlp = MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=200, random_state=0)

# 5 shuffled folds with a fixed seed, scored with macro-averaged F1.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

scoring_metric = make_scorer(f1_score, average='macro')

cv_scores = cross_val_score(clf_mlp, X_train_final, y_train, cv=kf, scoring=scoring_metric)

# Per-fold scores followed by their mean.
print("Puntuaciones de Validación Cruzada (F1-score):", cv_scores)
print("F1-score Promedio:", cv_scores.mean())

Puntuaciones de Validación Cruzada (F1-score): [0.44332137 0.44945133 0.44443061 0.44495357 0.46089238]
F1-score Promedio: 0.44860985228238703


## TF-IDF

In [11]:
# TF-IDF encoding of the text; fitted on the training split only.
X_train_vectorizer = vectorizer_tfidf.fit_transform(X_train['Title_Opinion'])
X_test_vectorizer = vectorizer_tfidf.transform(X_test['Title_Opinion'])

# Append the numeric columns to the right of the sparse text matrix.
X_train_final = hstack([X_train_vectorizer, csr_matrix(X_train[numeric_features].values)])
X_test_final = hstack([X_test_vectorizer, csr_matrix(X_test[numeric_features].values)])

# Two hidden layers of 50 units each. The MLP draws random initial weights and
# random mini-batches, so it is seeded (same seed as the split and the folds)
# to make the scores below reproducible under Restart & Run All.
clf_mlp = MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=200, random_state=0)

# 5 shuffled folds with a fixed seed, scored with macro-averaged F1.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

scoring_metric = make_scorer(f1_score, average='macro')

cv_scores = cross_val_score(clf_mlp, X_train_final, y_train, cv=kf, scoring=scoring_metric)

# Per-fold scores followed by their mean.
print("Puntuaciones de Validación Cruzada (F1-score):", cv_scores)
print("F1-score Promedio:", cv_scores.mean())

Puntuaciones de Validación Cruzada (F1-score): [0.42410442 0.46120458 0.44632765 0.45307946 0.44818173]
F1-score Promedio: 0.44657956845187396


# Mejor resultado

In [12]:
# Final evaluation: term-count features with logistic regression, trained on
# the whole training split and scored once on the held-out test split.
X_train_vectorizer = vectorizer_frecuencia.fit_transform(X_train['Title_Opinion'])
X_test_vectorizer = vectorizer_frecuencia.transform(X_test['Title_Opinion'])

# Append the numeric columns to the right of the sparse text matrix.
X_train_final = hstack([
    X_train_vectorizer,
    csr_matrix(X_train[numeric_features].values),
])
X_test_final = hstack([
    X_test_vectorizer,
    csr_matrix(X_test[numeric_features].values),
])

# fit() returns the estimator itself, so construction and training can be chained.
clf_lr = LogisticRegression(max_iter=10000).fit(X_train_final, y_train)

y_pred = clf_lr.predict(X_test_final)

# Confusion matrix first, then the per-class precision / recall / F1 report.
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(conf_matrix, classification_rep, sep='\n')

[[  53   16   14    8   13]
 [  21   37   44   24   19]
 [  19   33  169  113   88]
 [   6   15   98  433  611]
 [   0    3   49  413 3744]]
              precision    recall  f1-score   support

           1       0.54      0.51      0.52       104
           2       0.36      0.26      0.30       145
           3       0.45      0.40      0.42       422
           4       0.44      0.37      0.40      1163
           5       0.84      0.89      0.86      4209

    accuracy                           0.73      6043
   macro avg       0.52      0.49      0.50      6043
weighted avg       0.72      0.73      0.72      6043

