Enfoque del dataset de Dipromats basado en algoritmos clásicos de aprendizaje automático

In [1]:
### IMPORTS ###
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
#from keras.utils import to_categorical

!pip install textaugment
from textaugment import EDA

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

Collecting textaugment
  Downloading textaugment-1.3.4-py3-none-any.whl (16 kB)
Collecting googletrans (from textaugment)
  Downloading googletrans-3.0.0.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans->textaugment)
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.1/55.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting hstspreload (from httpx==0.13.3->googletrans->textaugment)
  Downloading hstspreload-2023.1.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
Collecting chardet==3.* (from httpx==0.13.3->googletrans->textaugment)
  Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting idna==2.* (from httpx==0.13.3->g

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


# Cargar Dataset

In [2]:
from google.colab import drive
drive.mount('/content/drive')

PATH = "/content/drive/My Drive/TFM/Data/Dipromats/"

Mounted at /content/drive


In [3]:
### PARTICIÓN ###
df = pd.read_json(PATH + "train_en.json",  encoding='utf-8', encoding_errors='ignore')
df = df[['text', 'label_task1']]
df = df.rename(columns={"text": "Text", "label_task1": "Label"})
df.fillna(" ", inplace=True)

X_train = df['Text']
y_train = df['Label']

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=55, stratify=y_train)

X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

print('Tamaño conjunto de Entrenamiento:', len(X_train))
print('Tamaño conjunto de Evaluación:', len(X_test))

Tamaño conjunto de Entrenamiento: 6726
Tamaño conjunto de Evaluación: 1682


# Limpieza y Representación de Textos

In [4]:
### LIMPIEZA DE TEXTOS ###
stopwords_es = stopwords.words("english")
def clean_text(text):
    # transformar a minúscula
    text=str(text).lower()
    # tokenizar
    tokens=word_tokenize(text)
    # borrar stopwords
    tokens = [word for word in tokens if word not in stopwords_es]
    # usar los stems
    tokens = [PorterStemmer().stem(word) for word in tokens]
    # eliminamos las palabras con menos de 2 caráceres
    # ignoramos cualquier palabra que contenga un digito o un símbolo especial
    min_length = 2
    p = re.compile('^[a-zA-Z]+$');
    filtered_tokens=[]
    for token in tokens:
        if len(token)>=min_length and p.match(token):
            filtered_tokens.append(token)

    return filtered_tokens

# Data Augmentation

In [5]:
t = EDA()

x = 0
for i in range(3274):
    text = str(X_train[i])

    if x == 0:
        new_text = t.synonym_replacement(text)
        X_train = X_train.append(pd.Series([new_text]), ignore_index=True)
        y_train = y_train.append(pd.Series([y_train[i]]), ignore_index=True)
    if x == 1:
        new_text = t.random_swap(text)
        X_train = X_train.append(pd.Series([new_text]), ignore_index=True)
        y_train = y_train.append(pd.Series([y_train[i]]), ignore_index=True)
    if x == 2:
        new_text = t.random_deletion(text, p=0.2)
        X_train = X_train.append(pd.Series([new_text]), ignore_index=True)
        y_train = y_train.append(pd.Series([y_train[i]]), ignore_index=True)

    x = x + 1
    if x == 3:
        x = 0

print('Tamaño conjunto de Entrenamiento:', len(X_train))

  X_train = X_train.append(pd.Series([new_text]), ignore_index=True)
  y_train = y_train.append(pd.Series([y_train[i]]), ignore_index=True)
  X_train = X_train.append(pd.Series([new_text]), ignore_index=True)
  y_train = y_train.append(pd.Series([y_train[i]]), ignore_index=True)
  X_train = X_train.append(pd.Series([new_text]), ignore_index=True)
  y_train = y_train.append(pd.Series([y_train[i]]), ignore_index=True)


Tamaño conjunto de Entrenamiento: 10000


# Label Encoding

No es necesario hacer un encoding ya que viene etiquetado de manera numérica

# Bolsa de Palabras

In [6]:
### BOLSA DE PALABRAS ###
X_train = X_train.tolist()
X_test = X_test.tolist()

# entrenamos un modelo de bolsa de palabras
bow = CountVectorizer(analyzer=clean_text).fit(X_train)
# transformamos el conjunto de entrenamiento a bolsa de palabras
X_train_bow = bow.transform(X_train)
# transformamos el conjunto de evaluación a bolsa de palabras
X_test_bow = bow.transform(X_test)

print("Tamaño del vocabulario: ", len(bow.vocabulary_))

Tamaño del vocabulario:  11098


# TF-IDF

In [7]:
### TF-IDF ###
# entrenamos un modelo tf-idf
tfidf_transformer = TfidfTransformer().fit(X_train_bow)
# transformamos el conjunto de entrenamiento
X_train_tfidf = tfidf_transformer.transform(X_train_bow)
# transformamos el conjunto de entrenamiento
X_test_tfidf = tfidf_transformer.transform(X_test_bow)

# Clasificación Clásica
Se crea un pipeline que ejecuta una secuencia de procesos:


1.   La representación de los textos en bolsa de palabras (CountVectorizer), que recibe como entrada los textos, y se les aplica dentro del CountVectorizer la función clean_text para limpiarlos y reducir el ruido.
2.   La representación en tf-idf (TfidfTransformer), recibe como entrada la salida del proceso 1, y produce los vectores tf-idf.
3. El clasificador SVC, Logistic Regression o Random Forest Clasiffier.

In [8]:
### PIPELINE SVM ###
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=clean_text)),
    ('tf', TfidfTransformer()),
    ('svm', SVC()),
])

# Parámetros para el algoritmo SVM
grid_params_svm = [{'svm__kernel': ['linear', 'rbf'],
                    'svm__C': [0.1, 1], # [0.1, 1, 10, 100, 1000]
                    'svm__gamma':  [1, 0.1] # [1, 0.1, 0.01, 0.001, 0.0001]
                    }]
gs = GridSearchCV(pipeline, param_grid=grid_params_svm,
                  scoring='accuracy', cv=4, verbose = 1)

# entrenamos el grid
gs.fit(X_train, y_train)
print('Los mejores parámetros son : %s' % gs.best_params_)
print('Mejor accuracy: %.3f' % gs.best_score_)
print(gs.best_estimator_)

best_svm = gs.best_estimator_
predictions = best_svm.predict(X_test)
print(classification_report(y_test, predictions))

Fitting 4 folds for each of 8 candidates, totalling 32 fits
Los mejores parámetros son : {'svm__C': 1, 'svm__gamma': 1, 'svm__kernel': 'rbf'}
Mejor accuracy: 0.911
Pipeline(steps=[('bow',
                 CountVectorizer(analyzer=<function clean_text at 0x7f39b6df8820>)),
                ('tf', TfidfTransformer()), ('svm', SVC(C=1, gamma=1))])
              precision    recall  f1-score   support

       False       0.85      0.96      0.90      1287
        True       0.79      0.43      0.56       395

    accuracy                           0.84      1682
   macro avg       0.82      0.70      0.73      1682
weighted avg       0.83      0.84      0.82      1682



In [9]:
### PIPELINE LOGISTIC REGRESSION ###
pipeline2 = Pipeline([
    ('bow', CountVectorizer(analyzer=clean_text)),
    ('tf', TfidfTransformer()),
    ('lr', LogisticRegression(random_state=0)),
])

# Parámetros para el algoritmo Logistic Regression
grid_params_lr = [{'lr__penalty': ['l1', 'l2'],
                    'lr__C': [1.0, 0.5],
                    'lr__solver':  ['liblinear']
                    }]
gs2 = GridSearchCV(pipeline2, param_grid=grid_params_lr,
                  scoring='accuracy', cv=5, verbose = 1)

# entrenamos el grid
gs2.fit(X_train, y_train)
print('Los mejores parámetros son : %s' % gs2.best_params_)
print('Mejor accuracy: %.3f' % gs2.best_score_)
print(gs2.best_estimator_)

best_svm = gs2.best_estimator_
predictions = best_svm.predict(X_test)
print( classification_report(y_test, predictions))

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Los mejores parámetros son : {'lr__C': 1.0, 'lr__penalty': 'l2', 'lr__solver': 'liblinear'}
Mejor accuracy: 0.844
Pipeline(steps=[('bow',
                 CountVectorizer(analyzer=<function clean_text at 0x7f39b6df8820>)),
                ('tf', TfidfTransformer()),
                ('lr', LogisticRegression(random_state=0, solver='liblinear'))])
              precision    recall  f1-score   support

       False       0.84      0.96      0.90      1287
        True       0.78      0.42      0.54       395

    accuracy                           0.84      1682
   macro avg       0.81      0.69      0.72      1682
weighted avg       0.83      0.84      0.82      1682



In [10]:
### PIPELINE RANDOM FOREST ###
pipeline3 = Pipeline([
    ('bow', CountVectorizer(analyzer=clean_text)),
    ('tf', TfidfTransformer()),
    ('rfc', RandomForestClassifier(random_state=0)),
])

# Parámetros para el algoritmo Random Forest
grid_params_rfc = [{'rfc__criterion': ['gini', 'entropy'],
                    'rfc__max_depth': [9, 10],
                    'rfc__min_samples_split':  [10]
                    }]
gs3 = GridSearchCV(pipeline3, param_grid=grid_params_rfc,
                  scoring='accuracy', cv=5, verbose = 1)

# entrenamos el grid
gs3.fit(X_train, y_train)
print('Los mejores parámetros son : %s' % gs3.best_params_)
print('Mejor accuracy: %.3f' % gs3.best_score_)
print(gs3.best_estimator_)

best_svm = gs3.best_estimator_
predictions = best_svm.predict(X_test)
print( classification_report(y_test, predictions))

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Los mejores parámetros son : {'rfc__criterion': 'gini', 'rfc__max_depth': 9, 'rfc__min_samples_split': 10}
Mejor accuracy: 0.764
Pipeline(steps=[('bow',
                 CountVectorizer(analyzer=<function clean_text at 0x7f39b6df8820>)),
                ('tf', TfidfTransformer()),
                ('rfc',
                 RandomForestClassifier(max_depth=9, min_samples_split=10,
                                        random_state=0))])
              precision    recall  f1-score   support

       False       0.77      1.00      0.87      1287
        True       0.00      0.00      0.00       395

    accuracy                           0.77      1682
   macro avg       0.38      0.50      0.43      1682
weighted avg       0.59      0.77      0.66      1682



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
