In [2]:
import pandas as pd
from sklearn.calibration import cross_val_predict
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, make_scorer, f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

In [4]:
# Three text encoders compared throughout the notebook: raw term counts,
# binary presence/absence, and TF-IDF weights.
vectorizer_frecuencia = CountVectorizer()
vectorizer_binary = CountVectorizer(binary=True)
vectorizer_tfidf = TfidfVectorizer()

# NOTE(review): loading a pickle can execute arbitrary code; only read this
# file if it comes from a trusted source.
df_normalizacion = pd.read_pickle('df_tokenizacion_lematizacion_emojis_final.pkl')

# Numeric columns that are appended to the vectorized text in later cells.
numeric_features = ['acumuladopositivo', 'acumuladonegative']
features = ['Title_Opinion', *numeric_features]

X = df_normalizacion[features]
y = df_normalizacion['Polarity']

# Hold out 20% of the rows for the final evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Report the size of each partition (one line per partition, blank line
# before and after).
print(
    f'\nTamaño de X_train: {len(X_train)} documentos',
    f'Tamaño de X_test: {len(X_test)} documentos',
    f'Tamaño de y_train: {len(y_train)} etiquetas',
    f'Tamaño de y_test: {len(y_test)} etiquetas\n',
    sep='\n',
)


Tamaño de X_train: 24169 documentos
Tamaño de X_test: 6043 documentos
Tamaño de y_train: 24169 etiquetas
Tamaño de y_test: 6043 etiquetas



# Regresión logística

## Frecuencia

In [3]:
# Logistic regression on raw term-frequency features, scored with 5-fold CV.

# Evaluation protocol: shuffled 5-fold split with a fixed seed, macro-averaged F1.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
scoring_metric = make_scorer(f1_score, average="macro")

# Text vectorization: the vocabulary is learned from the training split only
# and then reused to encode the test split.
# NOTE(review): the vectorizer is fit on all of X_train before cross_val_score
# runs, so each fold's held-out rows already contributed to the vocabulary.
# Put the vectorizer inside a Pipeline if leak-free fold scores are required.
X_train_vectorizer = vectorizer_frecuencia.fit_transform(X_train["Title_Opinion"])
X_test_vectorizer = vectorizer_frecuencia.transform(X_test["Title_Opinion"])

# Combine the vectorized text with the numeric columns (stacked horizontally).
# X_test_final is not used by the cross-validation in this cell.
X_train_final = hstack(
    [X_train_vectorizer, csr_matrix(X_train[numeric_features].to_numpy())]
)
X_test_final = hstack(
    [X_test_vectorizer, csr_matrix(X_test[numeric_features].to_numpy())]
)

# Classifier under evaluation.
clf_lr = LogisticRegression(max_iter=10000)

# One macro-F1 score per fold.
cv_scores = cross_val_score(
    clf_lr,
    X_train_final,
    y_train,
    scoring=scoring_metric,
    cv=kf,
)

# Per-fold scores followed by their mean.
print("Puntuaciones de Validación Cruzada (F1-score):", cv_scores)
print("F1-score Promedio:", cv_scores.mean())

Puntuaciones de Validación Cruzada (F1-score): [0.44677243 0.46789743 0.47388115 0.47588564 0.48445446]
F1-score Promedio: 0.46977822199434965


## Binario

In [4]:
# Logistic regression on binary (presence/absence) bag-of-words features,
# scored with 5-fold CV.

# Shuffled 5-fold split with a fixed seed; each fold is scored with macro F1.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
scoring_metric = make_scorer(f1_score, average="macro")

# Fit the binary vectorizer on the training text, then encode the test text
# with the same vocabulary.
X_train_vectorizer = vectorizer_binary.fit_transform(X_train["Title_Opinion"])
X_test_vectorizer = vectorizer_binary.transform(X_test["Title_Opinion"])

# Append the numeric columns to the sparse text matrix.
# X_test_final is not used by the cross-validation in this cell.
X_train_final = hstack(
    [X_train_vectorizer, csr_matrix(X_train[numeric_features].to_numpy())]
)
X_test_final = hstack(
    [X_test_vectorizer, csr_matrix(X_test[numeric_features].to_numpy())]
)

clf_lr = LogisticRegression(max_iter=10000)
cv_scores = cross_val_score(
    clf_lr,
    X_train_final,
    y_train,
    scoring=scoring_metric,
    cv=kf,
)

# Per-fold scores followed by their mean.
print("Puntuaciones de Validación Cruzada (F1-score):", cv_scores)
print("F1-score Promedio:", cv_scores.mean())

Puntuaciones de Validación Cruzada (F1-score): [0.45596859 0.44256214 0.46658067 0.46126232 0.48275254]
F1-score Promedio: 0.46182525389098145


## TF-IDF

In [5]:
# Logistic regression on TF-IDF features, scored with 5-fold CV.

# Shuffled 5-fold split with a fixed seed; each fold is scored with macro F1.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
scoring_metric = make_scorer(f1_score, average="macro")

# Fit the TF-IDF vectorizer on the training text, then encode the test text.
# NOTE(review): the vectorizer is fit on all of X_train before the folds are
# formed, so the held-out rows of every fold take part in that fit.
X_train_vectorizer = vectorizer_tfidf.fit_transform(X_train["Title_Opinion"])
X_test_vectorizer = vectorizer_tfidf.transform(X_test["Title_Opinion"])

# Append the numeric columns to the sparse text matrix.
# X_test_final is not used by the cross-validation in this cell.
X_train_final = hstack(
    [X_train_vectorizer, csr_matrix(X_train[numeric_features].to_numpy())]
)
X_test_final = hstack(
    [X_test_vectorizer, csr_matrix(X_test[numeric_features].to_numpy())]
)

clf_lr = LogisticRegression(max_iter=10000)
cv_scores = cross_val_score(
    clf_lr,
    X_train_final,
    y_train,
    scoring=scoring_metric,
    cv=kf,
)

# Per-fold scores followed by their mean.
print("Puntuaciones de Validación Cruzada (F1-score):", cv_scores)
print("F1-score Promedio:", cv_scores.mean())

Puntuaciones de Validación Cruzada (F1-score): [0.40677536 0.42195651 0.4200832  0.42921319 0.44136328]
F1-score Promedio: 0.4238783087485397


# SVM

## Frecuencia

In [6]:
# SVC with default hyperparameters on raw term-frequency features,
# scored with 5-fold CV.

# Shuffled 5-fold split with a fixed seed; each fold is scored with macro F1.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
scoring_metric = make_scorer(f1_score, average="macro")

# Fit the count vectorizer on the training text, then encode the test text
# with the same vocabulary.
X_train_vectorizer = vectorizer_frecuencia.fit_transform(X_train["Title_Opinion"])
X_test_vectorizer = vectorizer_frecuencia.transform(X_test["Title_Opinion"])

# Append the numeric columns to the sparse text matrix.
# X_test_final is not used by the cross-validation in this cell.
X_train_final = hstack(
    [X_train_vectorizer, csr_matrix(X_train[numeric_features].to_numpy())]
)
X_test_final = hstack(
    [X_test_vectorizer, csr_matrix(X_test[numeric_features].to_numpy())]
)

clf_svm = SVC()
cv_scores = cross_val_score(
    clf_svm,
    X_train_final,
    y_train,
    scoring=scoring_metric,
    cv=kf,
)

# Per-fold scores followed by their mean.
print("Puntuaciones de Validación Cruzada (F1-score):", cv_scores)
print("F1-score Promedio:", cv_scores.mean())

Puntuaciones de Validación Cruzada (F1-score): [0.29656362 0.27510307 0.28622718 0.27422927 0.27897558]
F1-score Promedio: 0.2822197447429898


## Binario

In [7]:
# SVC with default hyperparameters on binary (presence/absence) bag-of-words
# features, scored with 5-fold CV.

# Shuffled 5-fold split with a fixed seed; each fold is scored with macro F1.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
scoring_metric = make_scorer(f1_score, average="macro")

# Fit the binary vectorizer on the training text, then encode the test text
# with the same vocabulary.
X_train_vectorizer = vectorizer_binary.fit_transform(X_train["Title_Opinion"])
X_test_vectorizer = vectorizer_binary.transform(X_test["Title_Opinion"])

# Append the numeric columns to the sparse text matrix.
# X_test_final is not used by the cross-validation in this cell.
X_train_final = hstack(
    [X_train_vectorizer, csr_matrix(X_train[numeric_features].to_numpy())]
)
X_test_final = hstack(
    [X_test_vectorizer, csr_matrix(X_test[numeric_features].to_numpy())]
)

clf_svm = SVC()
cv_scores = cross_val_score(
    clf_svm,
    X_train_final,
    y_train,
    scoring=scoring_metric,
    cv=kf,
)

# Per-fold scores followed by their mean.
print("Puntuaciones de Validación Cruzada (F1-score):", cv_scores)
print("F1-score Promedio:", cv_scores.mean())

Puntuaciones de Validación Cruzada (F1-score): [0.36415662 0.37049847 0.36301339 0.34511615 0.37495746]
F1-score Promedio: 0.3635484186741677


## TF-IDF

In [8]:
# SVC with default hyperparameters on TF-IDF features, scored with 5-fold CV.

# Shuffled 5-fold split with a fixed seed; each fold is scored with macro F1.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
scoring_metric = make_scorer(f1_score, average="macro")

# Fit the TF-IDF vectorizer on the training text, then encode the test text.
# NOTE(review): the vectorizer is fit on all of X_train before the folds are
# formed, so the held-out rows of every fold take part in that fit.
X_train_vectorizer = vectorizer_tfidf.fit_transform(X_train["Title_Opinion"])
X_test_vectorizer = vectorizer_tfidf.transform(X_test["Title_Opinion"])

# Append the numeric columns to the sparse text matrix.
# X_test_final is not used by the cross-validation in this cell.
X_train_final = hstack(
    [X_train_vectorizer, csr_matrix(X_train[numeric_features].to_numpy())]
)
X_test_final = hstack(
    [X_test_vectorizer, csr_matrix(X_test[numeric_features].to_numpy())]
)

clf_svm = SVC()
cv_scores = cross_val_score(
    clf_svm,
    X_train_final,
    y_train,
    scoring=scoring_metric,
    cv=kf,
)

# Per-fold scores followed by their mean.
print("Puntuaciones de Validación Cruzada (F1-score):", cv_scores)
print("F1-score Promedio:", cv_scores.mean())

Puntuaciones de Validación Cruzada (F1-score): [0.21747358 0.21229895 0.22071545 0.22908578 0.23143698]
F1-score Promedio: 0.22220214773023567


# MLP

## Frecuencia

In [9]:
# MLP with hidden_layer_sizes=(50, 50) and max_iter=200 on raw term-frequency
# features, scored with 5-fold CV.

# Fit the count vectorizer on the training text, then encode the test text
# with the same vocabulary.
X_train_vectorizer = vectorizer_frecuencia.fit_transform(X_train['Title_Opinion'])
X_test_vectorizer = vectorizer_frecuencia.transform(X_test['Title_Opinion'])

# Append the numeric columns to the sparse text matrix.
# X_test_final is not used by the cross-validation in this cell.
X_train_final = hstack([X_train_vectorizer, csr_matrix(X_train[numeric_features].values)])
X_test_final = hstack([X_test_vectorizer, csr_matrix(X_test[numeric_features].values)])

# MLP training is stochastic (random weight initialisation and batch
# shuffling). Pin the seed so the fold scores are reproducible from run to
# run, consistent with the random_state=0 used for KFold below.
clf_mlp = MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=200, random_state=0)

# Shuffled 5-fold split with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Macro-averaged F1 as the cross-validation metric.
scoring_metric = make_scorer(f1_score, average='macro')

# One macro-F1 score per fold.
cv_scores = cross_val_score(clf_mlp, X_train_final, y_train, cv=kf, scoring=scoring_metric)

# Per-fold scores followed by their mean.
print("Puntuaciones de Validación Cruzada (F1-score):", cv_scores)
print("F1-score Promedio:", cv_scores.mean())

Puntuaciones de Validación Cruzada (F1-score): [0.45366304 0.47440909 0.46528245 0.4546367  0.47666915]
F1-score Promedio: 0.46493208848814715


In [5]:
# MLP with hidden_layer_sizes=(100, 100) and max_iter=200 on raw
# term-frequency features, scored with 5-fold CV.

# Fit the count vectorizer on the training text, then encode the test text
# with the same vocabulary.
X_train_vectorizer = vectorizer_frecuencia.fit_transform(X_train['Title_Opinion'])
X_test_vectorizer = vectorizer_frecuencia.transform(X_test['Title_Opinion'])

# Append the numeric columns to the sparse text matrix.
# X_test_final is not used by the cross-validation in this cell.
X_train_final = hstack([X_train_vectorizer, csr_matrix(X_train[numeric_features].values)])
X_test_final = hstack([X_test_vectorizer, csr_matrix(X_test[numeric_features].values)])

# MLP training is stochastic (random weight initialisation and batch
# shuffling). Pin the seed so that score differences between hyperparameter
# settings are not just run-to-run noise.
clf_mlp = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=200, random_state=0)

# Shuffled 5-fold split with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Macro-averaged F1 as the cross-validation metric.
scoring_metric = make_scorer(f1_score, average='macro')

# One macro-F1 score per fold.
cv_scores = cross_val_score(clf_mlp, X_train_final, y_train, cv=kf, scoring=scoring_metric)

# Per-fold scores followed by their mean.
print("Puntuaciones de Validación Cruzada (F1-score):", cv_scores)
print("F1-score Promedio:", cv_scores.mean())

Puntuaciones de Validación Cruzada (F1-score): [0.4496908  0.45670306 0.46326104 0.44265294 0.47412617]
F1-score Promedio: 0.45728680214562933


In [6]:
# MLP with hidden_layer_sizes=(50, 50) and max_iter=500 on raw term-frequency
# features, scored with 5-fold CV.

# Fit the count vectorizer on the training text, then encode the test text
# with the same vocabulary.
X_train_vectorizer = vectorizer_frecuencia.fit_transform(X_train['Title_Opinion'])
X_test_vectorizer = vectorizer_frecuencia.transform(X_test['Title_Opinion'])

# Append the numeric columns to the sparse text matrix.
# X_test_final is not used by the cross-validation in this cell.
X_train_final = hstack([X_train_vectorizer, csr_matrix(X_train[numeric_features].values)])
X_test_final = hstack([X_test_vectorizer, csr_matrix(X_test[numeric_features].values)])

# MLP training is stochastic (random weight initialisation and batch
# shuffling). Pin the seed so that score differences between hyperparameter
# settings are not just run-to-run noise.
clf_mlp = MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=500, random_state=0)

# Shuffled 5-fold split with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Macro-averaged F1 as the cross-validation metric.
scoring_metric = make_scorer(f1_score, average='macro')

# One macro-F1 score per fold.
cv_scores = cross_val_score(clf_mlp, X_train_final, y_train, cv=kf, scoring=scoring_metric)

# Per-fold scores followed by their mean.
print("Puntuaciones de Validación Cruzada (F1-score):", cv_scores)
print("F1-score Promedio:", cv_scores.mean())

Puntuaciones de Validación Cruzada (F1-score): [0.44646816 0.48283483 0.44852098 0.44830692 0.4701955 ]
F1-score Promedio: 0.45926527698734654


## Binario

In [10]:
# MLP with hidden_layer_sizes=(50, 50) and max_iter=200 on binary
# (presence/absence) bag-of-words features, scored with 5-fold CV.

# Fit the binary vectorizer on the training text, then encode the test text
# with the same vocabulary.
X_train_vectorizer = vectorizer_binary.fit_transform(X_train['Title_Opinion'])
X_test_vectorizer = vectorizer_binary.transform(X_test['Title_Opinion'])

# Append the numeric columns to the sparse text matrix.
# X_test_final is not used by the cross-validation in this cell.
X_train_final = hstack([X_train_vectorizer, csr_matrix(X_train[numeric_features].values)])
X_test_final = hstack([X_test_vectorizer, csr_matrix(X_test[numeric_features].values)])

# MLP training is stochastic (random weight initialisation and batch
# shuffling). Pin the seed so the fold scores are reproducible from run to
# run, consistent with the random_state=0 used for KFold below.
clf_mlp = MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=200, random_state=0)

# Shuffled 5-fold split with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Macro-averaged F1 as the cross-validation metric.
scoring_metric = make_scorer(f1_score, average='macro')

# One macro-F1 score per fold.
cv_scores = cross_val_score(clf_mlp, X_train_final, y_train, cv=kf, scoring=scoring_metric)

# Per-fold scores followed by their mean.
print("Puntuaciones de Validación Cruzada (F1-score):", cv_scores)
print("F1-score Promedio:", cv_scores.mean())

Puntuaciones de Validación Cruzada (F1-score): [0.44075526 0.44182548 0.44855088 0.45364299 0.46997477]
F1-score Promedio: 0.45094987649351975


## TF-IDF

In [11]:
# MLP with hidden_layer_sizes=(50, 50) and max_iter=200 on TF-IDF features,
# scored with 5-fold CV.

# Fit the TF-IDF vectorizer on the training text, then encode the test text.
X_train_vectorizer = vectorizer_tfidf.fit_transform(X_train['Title_Opinion'])
X_test_vectorizer = vectorizer_tfidf.transform(X_test['Title_Opinion'])

# Append the numeric columns to the sparse text matrix.
# X_test_final is not used by the cross-validation in this cell.
X_train_final = hstack([X_train_vectorizer, csr_matrix(X_train[numeric_features].values)])
X_test_final = hstack([X_test_vectorizer, csr_matrix(X_test[numeric_features].values)])

# MLP training is stochastic (random weight initialisation and batch
# shuffling). Pin the seed so the fold scores are reproducible from run to
# run, consistent with the random_state=0 used for KFold below.
clf_mlp = MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=200, random_state=0)

# Shuffled 5-fold split with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Macro-averaged F1 as the cross-validation metric.
scoring_metric = make_scorer(f1_score, average='macro')

# One macro-F1 score per fold.
cv_scores = cross_val_score(clf_mlp, X_train_final, y_train, cv=kf, scoring=scoring_metric)

# Per-fold scores followed by their mean.
print("Puntuaciones de Validación Cruzada (F1-score):", cv_scores)
print("F1-score Promedio:", cv_scores.mean())

Puntuaciones de Validación Cruzada (F1-score): [0.40964617 0.44088708 0.46218743 0.4434351  0.45954357]
F1-score Promedio: 0.44313987151099904


# Mejor resultado

In [12]:
# Final evaluation: logistic regression on term-frequency features, trained on
# the whole training split and scored once on the held-out test split.

# Re-fit the count vectorizer on the training text so the vocabulary matches
# the matrices built here, then encode the test text with it.
X_train_vectorizer = vectorizer_frecuencia.fit_transform(X_train["Title_Opinion"])
X_test_vectorizer = vectorizer_frecuencia.transform(X_test["Title_Opinion"])

# Append the numeric columns to the sparse text matrices.
X_train_final = hstack(
    [X_train_vectorizer, csr_matrix(X_train[numeric_features].to_numpy())]
)
X_test_final = hstack(
    [X_test_vectorizer, csr_matrix(X_test[numeric_features].to_numpy())]
)

# fit() returns the estimator itself, so clf_lr is the trained model.
clf_lr = LogisticRegression(max_iter=10000).fit(X_train_final, y_train)
y_pred = clf_lr.predict(X_test_final)

# Confusion matrix (rows: true labels, columns: predicted labels, per the
# scikit-learn convention) and the per-class precision/recall/F1 report.
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(conf_matrix, classification_rep, sep="\n")

[[  54   16   14    8   12]
 [  21   37   43   23   21]
 [  18   34  170  111   89]
 [   6   16  101  430  610]
 [   0    4   48  411 3746]]
              precision    recall  f1-score   support

           1       0.55      0.52      0.53       104
           2       0.35      0.26      0.29       145
           3       0.45      0.40      0.43       422
           4       0.44      0.37      0.40      1163
           5       0.84      0.89      0.86      4209

    accuracy                           0.73      6043
   macro avg       0.52      0.49      0.50      6043
weighted avg       0.72      0.73      0.72      6043

