In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

In [2]:
data = pd.read_csv('../Corpus/normaliced_data/politicES.csv', index_col=0)

X = data['tweets']

y = data['genero']

print(X.shape)
print(y.shape)

(180000,)
(180000,)


In [3]:
# Se divide el conjunto de entrenamiento y de orueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state=42)

print('X entrenamiento: {}, y entrenamiento: {}\nX prueba: {}, y prueba: {}'.format(X_train.shape,y_train.shape, X_test.shape, y_test.shape))

X entrenamiento: (144000,), y entrenamiento: (144000,)
X prueba: (36000,), y prueba: (36000,)


In [4]:
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

In [5]:
# Se crea una instancia de TfidfVectorizer() y se asigna a la variable vectorized
vectorized = TfidfVectorizer()
# x_train_tfidf se obtiene al aplicar el método fit_transform() de vectorized a los datos de entrenamiento x_train
x_train_tfidf = vectorized.fit_transform(X_train)
x_test_tfidf = vectorized.transform(X_test)

print('TF-IDF:\nX entrenamiento: {} X prueba: {}'.format(x_train_tfidf.shape, x_test_tfidf.shape))

TF-IDF:
X entrenamiento: (144000, 77038) X prueba: (36000, 77038)


In [6]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)

X_train_oversampled_genero, y_train_oversampled_genero = smote.fit_resample(x_train_tfidf, y_train)
print(X_train_oversampled_genero.shape)
print(y_train_oversampled_genero.shape)

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f91e3f8e320>
Traceback (most recent call last):
  File "/home/bruno-rg/anaconda3/envs/nlp/lib/python3.10/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/bruno-rg/anaconda3/envs/nlp/lib/python3.10/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/bruno-rg/anaconda3/envs/nlp/lib/python3.10/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/bruno-rg/anaconda3/envs/nlp/lib/python3.10/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'


(191098, 77038)
(191098,)


In [7]:
from imblearn.under_sampling import RandomUnderSampler
# Aplicar undersampling
undersampler = RandomUnderSampler(random_state=42)

X_undersampled_genero, y_undersampled_genero = undersampler.fit_resample(x_train_tfidf, y_train)
print(X_undersampled_genero.shape)
print(y_undersampled_genero.shape)

(96902, 77038)
(96902,)


In [8]:
def Regresion(solver, penalty, x_t, y_t, x_tst, y_tst):
    lr_model = LogisticRegression(penalty=penalty, solver=solver, max_iter=1000)
    # Se ajustan los datos al modelo
    lr_model.fit(x_t, y_t)

    predictions = lr_model.predict(x_tst)

    # Calcular la exactitud
    print(solver)
    print(penalty)
    exactitud = round(accuracy_score(y_tst, predictions),2)
    presicion = round(precision_score(y_tst, predictions),2)
    f1 = round(f1_score(y_tst, predictions),2)
    recall = round(recall_score(y_tst, predictions),2)

    print("Exactitud:", exactitud)
    print("Precision:", presicion)
    print("F1:", f1)
    print("Recall:", recall)

In [9]:
parameters = {
    'lbfgs' : ['l2', None],
    'liblinear' : ['l1', 'l2'],
    'newton-cg' : ['l2', None],
}

In [10]:
# Sin balanceo de clase
for param in parameters:
    for penal in parameters[param]:
        Regresion(param, penal, x_train_tfidf, y_train, x_test_tfidf, y_test)

lbfgs
l2
Exactitud: 0.69
Precision: 0.71
F1: 0.8
Recall: 0.9


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


lbfgs
None
Exactitud: 0.65
Precision: 0.73
F1: 0.74
Recall: 0.75
liblinear
l1
Exactitud: 0.69
Precision: 0.71
F1: 0.8
Recall: 0.91
liblinear
l2
Exactitud: 0.69
Precision: 0.71
F1: 0.8
Recall: 0.9
newton-cg
l2
Exactitud: 0.69
Precision: 0.71
F1: 0.8
Recall: 0.9
newton-cg
None
Exactitud: 0.65
Precision: 0.72
F1: 0.74
Recall: 0.75




In [11]:
# Oversampling 
for param in parameters:
    for penal in parameters[param]:
        Regresion(param, penal, X_train_oversampled_genero, y_train_oversampled_genero, x_test_tfidf, y_test)

lbfgs
l2
Exactitud: 0.65
Precision: 0.77
F1: 0.72
Recall: 0.68


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


lbfgs
None
Exactitud: 0.62
Precision: 0.74
F1: 0.7
Recall: 0.66
liblinear
l1
Exactitud: 0.64
Precision: 0.77
F1: 0.71
Recall: 0.66
liblinear
l2
Exactitud: 0.65
Precision: 0.77
F1: 0.72
Recall: 0.68
newton-cg
l2
Exactitud: 0.65
Precision: 0.77
F1: 0.72
Recall: 0.68
newton-cg
None
Exactitud: 0.62
Precision: 0.74
F1: 0.7
Recall: 0.66




In [12]:
# Undersampling
for param in parameters:
    for penal in parameters[param]:
        Regresion(param, penal, X_undersampled_genero, y_undersampled_genero, x_test_tfidf, y_test)

lbfgs
l2
Exactitud: 0.64
Precision: 0.78
F1: 0.7
Recall: 0.64


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


lbfgs
None
Exactitud: 0.59
Precision: 0.74
F1: 0.65
Recall: 0.58
liblinear
l1
Exactitud: 0.63
Precision: 0.77
F1: 0.69
Recall: 0.63
liblinear
l2
Exactitud: 0.64
Precision: 0.78
F1: 0.7
Recall: 0.64
newton-cg
l2
Exactitud: 0.64
Precision: 0.78
F1: 0.7
Recall: 0.64
newton-cg
None
Exactitud: 0.58
Precision: 0.74
F1: 0.65
Recall: 0.58


