Enfoque del dataset de noticias falsas basado en algoritmos clásicos de aprendizaje automático

In [None]:
### IMPORTS ###
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Cargar Dataset

In [None]:
### DRIVE DATA ###
# Colab-only import: this cell requires the Google Colab runtime.
from google.colab import drive

# Mount Google Drive under /content/drive.
# NOTE(review): force_remount=True presumably re-mounts even when a mount already exists — confirm.
drive.mount('/content/drive', force_remount=True)
# Base directory of the dataset on the mounted Drive; the loading cell appends file names to it.
PATH = "/content/drive/My Drive/TFM/Data/FakeNews_Task3_2022/"

Mounted at /content/drive


In [None]:
### TRAINING, VALIDATION AND TEST SETS ###
# Each split is read from CSV, reduced to its text and rating columns, and
# renamed so that all three frames share the same 'Text' / 'Label' schema.
df_train = (
    pd.read_csv(PATH + "Task3_train_dev/Task3_english_training.csv")[['text', 'our rating']]
    .rename(columns={"text": "Text", "our rating": "Label"})
)

# NOTE(review): the dev split is read with the column name 'our ratinge'
# (trailing "e"), unlike the other two splits — presumably a quirk of that
# CSV file; confirm against the data before "fixing" it.
df_dev = (
    pd.read_csv(PATH + "Task3_train_dev/Task3_english_dev.csv")[['text', 'our ratinge']]
    .rename(columns={"text": "Text", "our ratinge": "Label"})
)

df_test = (
    pd.read_csv(PATH + "Task3_Test/English_data_test_release_with_rating.csv")[['text', 'our rating']]
    .rename(columns={"text": "Text", "our rating": "Label"})
)

# Report the number of rows in each split.
print('Tamaño Conjunto de Entrenamiento:', len(df_train))
print('Tamaño Conjunto de Validación:', len(df_dev))
print('Tamaño Conjunto de Evaluación:', len(df_test))

Tamaño Conjunto de Entrenamiento: 900
Tamaño Conjunto de Validación: 364
Tamaño Conjunto de Evaluación: 612


# Label Encoding

In [None]:
### LABEL ENCODING ###
# Gold labels of the training and evaluation splits as plain Python lists.
y_train = df_train['Label'].tolist()
y_test = df_test['Label'].tolist()

# Distinct training labels in sorted order; a label's position is its index.
LABELS = sorted(set(y_train))

# Two-way lookup tables between label strings and their integer indices.
label2idx = dict(zip(LABELS, range(len(LABELS))))
idx2label = dict(enumerate(LABELS))

print('Labels:', label2idx)
# Encoder instance created here; it is not fitted in this cell.
le = LabelEncoder()

Labels: {'False': 0, 'Other': 1, 'Partially false': 2, 'True': 3}


# Limpieza y Representación de Textos

In [None]:
### TEXT CLEANING ###
stopwords_en = stopwords.words("english")

# Built once instead of on every call / every token: a frozenset gives
# constant-time stopword lookups, a single stemmer instance is reused, and the
# token pattern is compiled a single time.
_STOPWORDS_EN = frozenset(stopwords_en)
_STEMMER = PorterStemmer()
_MIN_TOKEN_LENGTH = 3
_ALPHA_TOKEN = re.compile('^[a-zA-Z]+$')


def clean_text(text):
    """Turn a raw document into a list of filtered word stems.

    Processing order:
      1. convert to ``str`` and lowercase,
      2. tokenize with ``word_tokenize``,
      3. drop English stopwords,
      4. stem every remaining token with the Porter stemmer,
      5. keep only stems that have at least 3 characters and consist solely
         of ASCII letters (anything containing digits or symbols is dropped).

    Args:
        text: The document to clean. It is passed through ``str()`` first, so
            non-string values are accepted.

    Returns:
        list[str]: The surviving stems, in their original order.
    """
    tokens = word_tokenize(str(text).lower())
    # Stopwords are removed before stemming; length/charset are checked after it.
    stems = (_STEMMER.stem(token) for token in tokens if token not in _STOPWORDS_EN)
    return [
        stem for stem in stems
        if len(stem) >= _MIN_TOKEN_LENGTH and _ALPHA_TOKEN.match(stem)
    ]

# Bolsa de Palabras

In [None]:
### BAG OF WORDS ###
# Raw texts of the training and evaluation splits.
X_train = df_train['Text'].tolist()
X_test = df_test['Text'].tolist()

# Learn the vocabulary and vectorize the training texts in a single pass.
# fit() followed by transform() on the same data would run the clean_text
# analyzer (tokenizing + stemming) twice over the whole training corpus.
bow = CountVectorizer(analyzer=clean_text)
X_train_bow = bow.fit_transform(X_train)
# Vectorize the evaluation texts with the vocabulary learned on the training set.
X_test_bow = bow.transform(X_test)

print("Tamaño del vocabulario: ", len(bow.vocabulary_))

Tamaño del vocabulario:  16672


# TF-IDF

In [None]:
### TF-IDF ###
# Fit the TF-IDF weighting on the training bag-of-words counts.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_bow)
# Apply the fitted weighting first to the training counts, then to the
# evaluation counts.
X_train_tfidf, X_test_tfidf = (
    tfidf_transformer.transform(counts) for counts in (X_train_bow, X_test_bow)
)

# Clasificación Clásica
Se crea un pipeline que ejecuta una secuencia de procesos:


1.   La representación de los textos en bolsa de palabras (CountVectorizer), que recibe como entrada los textos y les aplica internamente la función clean_text para limpiarlos y reducir el ruido.
2.   La representación en tf-idf (TfidfTransformer), recibe como entrada la salida del proceso 1, y produce los vectores tf-idf. 
3. El clasificador: SVC, Logistic Regression o Random Forest Classifier.

In [None]:
### PIPELINE SVM ###
# End-to-end pipeline: raw text -> bag of words (via clean_text) -> TF-IDF -> SVM.
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=clean_text)),
    ('tf', TfidfTransformer()),
    ('svm', SVC()),
])

# Hyper-parameter grid for the SVM.
# gamma is a kernel coefficient that the linear kernel does not use, so it is
# searched for the RBF kernel only; crossing it with 'linear' would just
# repeat identical fits.
grid_params_svm = [
    {'svm__kernel': ['linear'],
     'svm__C': [0.1, 1]},  # wider option: [0.1, 1, 10, 100, 1000]
    {'svm__kernel': ['rbf'],
     'svm__C': [0.1, 1],  # wider option: [0.1, 1, 10, 100, 1000]
     'svm__gamma': [1, 0.1]},  # wider option: [1, 0.1, 0.01, 0.001, 0.0001]
]
gs = GridSearchCV(pipeline, param_grid=grid_params_svm,
                  scoring='accuracy', cv=5, verbose=1)

# Run the 5-fold cross-validated grid search on the training texts.
gs.fit(X_train, y_train)
print('Los mejores parámetros son : %s' % gs.best_params_)
print('Mejor accuracy: %.3f' % gs.best_score_)
print(gs.best_estimator_)

# Evaluate the best pipeline found by the search on the held-out test set.
best_svm = gs.best_estimator_
predictions = best_svm.predict(X_test)
# labels=LABELS pins the report rows to the known classes in sorted order;
# zero_division=0 reports 0.0 for classes that are never predicted without
# emitting an undefined-metric warning for each of them.
print(classification_report(y_test, predictions, labels=LABELS, zero_division=0))

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Los mejores parámetros son : {'svm__C': 1, 'svm__gamma': 1, 'svm__kernel': 'linear'}
Mejor accuracy: 0.594
Pipeline(steps=[('bow',
                 CountVectorizer(analyzer=<function clean_text at 0x7f731d39f0d0>)),
                ('tf', TfidfTransformer()),
                ('svm', SVC(C=1, gamma=1, kernel='linear'))])
                 precision    recall  f1-score   support

          False       0.54      0.99      0.70       315
          Other       0.00      0.00      0.00        31
Partially false       0.21      0.12      0.16        56
           True       0.86      0.03      0.06       210

       accuracy                           0.53       612
      macro avg       0.40      0.29      0.23       612
   weighted avg       0.59      0.53      0.39       612



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
### PIPELINE LOGISTIC REGRESSION ###
# End-to-end pipeline: raw text -> bag of words (via clean_text) -> TF-IDF -> logistic regression.
pipeline2 = Pipeline([
    ('bow', CountVectorizer(analyzer=clean_text)),
    ('tf', TfidfTransformer()),
    ('lr', LogisticRegression(random_state=0)),
])

# Hyper-parameter grid for logistic regression: both penalties and two values
# of C, with the solver fixed to 'liblinear'.
grid_params_lr = [{'lr__penalty': ['l1', 'l2'],
                   'lr__C': [1.0, 0.5],
                   'lr__solver': ['liblinear']
                   }]
gs2 = GridSearchCV(pipeline2, param_grid=grid_params_lr,
                   scoring='accuracy', cv=5, verbose=1)

# Run the 5-fold cross-validated grid search on the training texts.
gs2.fit(X_train, y_train)
print('Los mejores parámetros son : %s' % gs2.best_params_)
print('Mejor accuracy: %.3f' % gs2.best_score_)
print(gs2.best_estimator_)

# Evaluate the best pipeline on the held-out test set. It is stored under its
# own name so the SVM model kept in `best_svm` is not silently overwritten.
best_lr = gs2.best_estimator_
predictions = best_lr.predict(X_test)
# labels=LABELS pins the report rows to the known classes in sorted order;
# zero_division=0 reports 0.0 for classes that are never predicted without
# emitting an undefined-metric warning for each of them.
print(classification_report(y_test, predictions, labels=LABELS, zero_division=0))

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Los mejores parámetros son : {'lr__C': 1.0, 'lr__penalty': 'l2', 'lr__solver': 'liblinear'}
Mejor accuracy: 0.560
Pipeline(steps=[('bow',
                 CountVectorizer(analyzer=<function clean_text at 0x7f731d39f0d0>)),
                ('tf', TfidfTransformer()),
                ('lr', LogisticRegression(random_state=0, solver='liblinear'))])
                 precision    recall  f1-score   support

          False       0.52      0.99      0.68       315
          Other       0.00      0.00      0.00        31
Partially false       0.25      0.05      0.09        56
           True       0.00      0.00      0.00       210

       accuracy                           0.52       612
      macro avg       0.19      0.26      0.19       612
   weighted avg       0.29      0.52      0.36       612



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
### PIPELINE RANDOM FOREST ###
# End-to-end pipeline: raw text -> bag of words (via clean_text) -> TF-IDF -> random forest.
pipeline3 = Pipeline([
    ('bow', CountVectorizer(analyzer=clean_text)),
    ('tf', TfidfTransformer()),
    ('rfc', RandomForestClassifier(random_state=0)),
])

# Hyper-parameter grid for the random forest: two split criteria and two
# maximum depths, with min_samples_split fixed to 10.
grid_params_rfc = [{'rfc__criterion': ['gini', 'entropy'],
                    'rfc__max_depth': [9, 10],
                    'rfc__min_samples_split': [10]
                    }]
gs3 = GridSearchCV(pipeline3, param_grid=grid_params_rfc,
                   scoring='accuracy', cv=5, verbose=1)

# Run the 5-fold cross-validated grid search on the training texts.
gs3.fit(X_train, y_train)
print('Los mejores parámetros son : %s' % gs3.best_params_)
print('Mejor accuracy: %.3f' % gs3.best_score_)
print(gs3.best_estimator_)

# Evaluate the best pipeline on the held-out test set. It is stored under its
# own name so the SVM model kept in `best_svm` is not silently overwritten.
best_rf = gs3.best_estimator_
predictions = best_rf.predict(X_test)
# labels=LABELS pins the report rows to the known classes in sorted order;
# zero_division=0 reports 0.0 for classes that are never predicted without
# emitting an undefined-metric warning for each of them.
print(classification_report(y_test, predictions, labels=LABELS, zero_division=0))

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Los mejores parámetros son : {'rfc__criterion': 'gini', 'rfc__max_depth': 10, 'rfc__min_samples_split': 10}
Mejor accuracy: 0.538
Pipeline(steps=[('bow',
                 CountVectorizer(analyzer=<function clean_text at 0x7f731d39f0d0>)),
                ('tf', TfidfTransformer()),
                ('rfc',
                 RandomForestClassifier(max_depth=10, min_samples_split=10,
                                        random_state=0))])
                 precision    recall  f1-score   support

          False       0.52      0.99      0.68       315
          Other       0.00      0.00      0.00        31
Partially false       0.40      0.04      0.07        56
           True       0.00      0.00      0.00       210

       accuracy                           0.51       612
      macro avg       0.23      0.26      0.19       612
   weighted avg       0.30      0.51      0.36       612



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
